在.Net 8中Enumerable.Range性能改进
前言
还有一周就要发布.Net 8了,把一些早期记录感兴趣的代码,顺便记录下来.今天主要说一些Enumerable.Range在.Net 8中性能改进.- Vectorize Enumerable.Range initialization, take 2 (#87992)

测试代码示例
using System.Linq;
using BenchmarkDotNet.Attributes;
namespace CSharpBenchmarks.ArrayTest
{
[MemoryDiagnoser]
[DisassemblyDiagnoser(printSource: true)]
public class EnumerableTest
{
[Params(16, 128, 1024)]
public int Count { get; set; }
[Benchmark]
public void RangeTest()
{
for (int i = 0; i < Count; i++)
{
var arr = Enumerable.Range(0, Count).ToArray(); //Enumerable.Range使用
for (int j = 0; j < arr.Length; j++)
{
if (arr[j] != 0)
{
}
}
}
}
}
}
看BenchmarkDotNet测试结果:
Range在.Net7和.Net 8性能测试对比中,提升还是很明显的.
- 执行16次,对比.Net 7的性能是提升24%
- 执行128次和1024次时,对比.Net 7的性能提升在38%左右
接下来我们看一下.Net 8生成的汇编代码(生成的汇编代码较多,就不展示.Net 7生成的汇编代码了):
; CSharpBenchmarks.ArrayTest.EnumerableTest.RangeTest()
; for (int i = 0; i<Count; i++)
; ^^^^^^^^^
; var arr = Enumerable.Range(0, Count).ToArray();
; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
; for (int j = 0; j<arr.Length; j++)
; ^^^^^^^^^
; if (arr[j] != 0)
; ^^^^^^^^^^^^^^^^
push rdi
push rsi
push rbp
push rbx
sub rsp,38
xor eax, eax
mov[rsp + 28],rax
mov[rsp + 30], rax
mov rbx, rcx
xor esi, esi
cmp dword ptr[rbx + 8],0
jle near ptr M00_L05
M00_L00:
mov edi,[rbx + 8]
movsxd rcx, edi
dec rcx
test edi, edi
jl near ptr M00_L06
cmp rcx,7FFFFFFF
jg near ptr M00_L06
test edi,edi
je near ptr M00_L08
mov rcx,offset MT_System.Linq.Enumerable+RangeIterator
call CORINFO_HELP_NEWSFAST
mov rbp, rax
call CORINFO_HELP_GETCURRENTMANAGEDTHREADID
mov [rbp+8], eax
xor edx, edx
mov [rbp+14], edx
mov [rbp+18], edi
M00_L01:
test rbp, rbp
je near ptr M00_L07
mov rdx, rbp
mov rcx, offset MT_System.Linq.IIListProvider`1[[System.Int32, System.Private.CoreLib]]
call qword ptr[7FFB30974348]; System.Runtime.CompilerServices.CastHelpers.IsInstanceOfInterface(Void*, System.Object)
test rax,rax
je near ptr M00_L09
mov rdx,offset MT_System.Linq.Enumerable+RangeIterator
cmp [rax], rdx
jne near ptr M00_L10
mov ebp, [rax+14]
mov edx, [rax+18]
sub edx, ebp
movsxd rdx, edx
mov rcx, offset MT_System.Int32[]
call CORINFO_HELP_NEWARR_1_VC
mov rdi, rax
lea rcx, [rdi+10]
mov edx, [rdi+8]
mov [rsp+28], rcx
mov [rsp+30], edx
lea rcx, [rsp+28]
mov edx, ebp
;.Net 8中新增Fill填充数据,并且使用SIMD
call qword ptr[7FFB310D7BE8]; System.Linq.Enumerable+RangeIterator.Fill(System.Span`1<Int32>, Int32)
M00_L02:
xor ecx, ecx
mov edx,[rdi + 8]
test edx, edx
jle short M00_L04
M00_L03:
inc ecx
cmp edx, ecx
jg short M00_L03
M00_L04:
inc esi
cmp esi,[rbx + 8]
jl near ptr M00_L00
M00_L05:
add rsp,38
pop rbx
pop rbp
pop rsi
pop rdi
ret
M00_L06:
mov ecx,1
call qword ptr[7FFB30CBFC00]
int 3
M00_L07:
mov ecx,10
call qword ptr[7FFB30CBFBE8]
int 3
M00_L08:
mov rcx,7FFB30AD0A60
mov edx,4
call CORINFO_HELP_CLASSINIT_SHARED_DYNAMICCLASS
mov rcx,25AB0C01EA0
mov rbp,[rcx]
jmp near ptr M00_L01
M00_L09:
mov rcx, rbp
call qword ptr[7FFB310D7E28]
mov rdi,rax
jmp short M00_L02
M00_L10:
mov rcx, rax
mov r11,7FFB30810730
call qword ptr[r11]
mov rdi, rax
jmp near ptr M00_L02
; Total bytes of code 356
; System.Linq.Enumerable+RangeIterator.Fill(System.Span`1<Int32>, Int32)
vzeroupper
mov rax,[rcx]
mov ecx,[rcx + 8]
mov r8d, ecx
lea r8,[rax + r8 * 4]
cmp ecx,8
jl short M02_L02
vmovd xmm0, edx
vpbroadcastd ymm0, xmm0
vpaddd ymm0, ymm0,[7FFB30FC3AC0]
vmovups ymm1,[7FFB30FC3AE0]
lea rdx,[r8 - 20]
nop dword ptr[rax]
nop dword ptr[rax]
M02_L00:
vmovups[rax],ymm0
vpaddd ymm0,ymm0,ymm1
add rax,20
cmp rax, rdx
jbe short M02_L00
vmovd edx, xmm0
cmp rax, r8
jb short M02_L03
M02_L01:
vzeroupper
ret
M02_L02:
cmp rax, r8
jae short M02_L01
M02_L03:
lea ecx,[rdx + 1]
mov[rax],edx
add rax,4
mov edx, ecx
jmp short M02_L02
; Total bytes of code 112
通过生成的汇编代码对比,发现在.Net 8新增了Fill方法填充数据,并且是使用了SIMD(Single Instruction Multiple Data,单指令流多数据流),接下来我们看看最新的源码:
public int[] ToArray()
{
int start = _start;
int[] array = new int[_end - start];
Fill(array, start); //ToArray先分配一个数组后,使用Fill填充数据
return array;
}
public List<int> ToList()
{
(int start, int end) = (_start, _end);
List<int> list = new List<int>(end - start);
Fill(SetCountAndGetSpan(list, end - start), start); //ToList先创建一个List,使用Fill填充数据
return list;
}
private static void Fill(Span<int> destination, int value)
{
ref int pos = ref MemoryMarshal.GetReference(destination);
ref int end = ref Unsafe.Add(ref pos, destination.Length);
if (Vector.IsHardwareAccelerated &&
Vector<int>.Count <= 8 &&
destination.Length >= Vector<int>.Count)
{
//初始化集合值
Vector<int> init = new Vector<int>((ReadOnlySpan<int>)[0, 1, 2, 3, 4, 5, 6, 7]);
// 先用初始化vector中的值,让后加上init的值,current中的为: 0, 1, 2, 3, 4, 5, 6, 7
Vector<int> current = new Vector<int>(value) + init;
//将vector中的值,用Vector<int>.Count当作初始值,这里8
Vector<int> increment = new Vector<int>(Vector<int>.Count);
//根据end,减去Vector<int>.Count,计算出循环结束的位置
ref int oneVectorFromEnd = ref Unsafe.Subtract(ref end, Vector<int>.Count);
do
{
//将current的值保存pos开始,到pos+Vector<int>.Count的
current.StoreUnsafe(ref pos);
//将current中的值 0, 1, 2, 3, 4, 5, 6, 7+都加上8
current += increment;
//移动pos位置,将pos+Vector<int>.Count
pos = ref Unsafe.Add(ref pos, Vector<int>.Count);
}
while (!Unsafe.IsAddressGreaterThan(ref pos, ref oneVectorFromEnd));
value = current[0];
}
while (Unsafe.IsAddressLessThan(ref pos, ref end))
{
pos = value++;
pos = ref Unsafe.Add(ref pos, 1);
}
}
秋风
2023-11-08