.NET 7 中正则表达式的性能改进

起因

前天 .NET 7 发布了 Preview 2,昨天我卸载了 Preview 1 并安装了 Preview 2。今天看到微软博客上有关于 .NET 7 Preview 2 改进的相关介绍,于是使用 BenchmarkDotNet 进行了基准测试。

测试代码

using System.Text.RegularExpressions;
using BenchmarkDotNet.Attributes;

namespace net6perf.Synax
{
    /// <summary>
    /// BenchmarkDotNet benchmark comparing a <see cref="Regex"/> constructed at
    /// run time (<see cref="Before"/>, the baseline) against the source-generated
    /// regex returned by <see cref="MyRegex"/> (<see cref="After"/>).
    /// Both use the pattern <c>abc|def</c> with <see cref="RegexOptions.IgnoreCase"/>.
    /// </summary>
    [DisassemblyDiagnoser(printSource: true, maxDepth: 3)]
    [MemoryDiagnoser]
    public partial class RegexTest
    {
        /// <summary>Number of <c>IsMatch</c> calls performed per benchmark invocation.</summary>
        [Params(1024, 2048, 4096)]
        public int Count { get; set; }

        /// <summary>Text handed to <c>IsMatch</c> on every iteration.</summary>
        [Params("qwe", "abc", "xyz")]
        public string? Input { get; set; }

        /// <summary>Regex built through the ordinary constructor; used by the baseline.</summary>
        public Regex regex = new(@"abc|def", RegexOptions.IgnoreCase);

        // .NET 7 adds regex source generation. Usage, in short:
        //   1. Annotate the method with the RegexGenerator attribute.
        //   2. Declare the method as static and partial.
        [RegexGenerator(@"abc|def", RegexOptions.IgnoreCase)]
        public static partial Regex MyRegex();

        /// <summary>
        /// Baseline: calls <c>IsMatch</c> on the constructed <see cref="regex"/>
        /// <see cref="Count"/> times, adding the loop index to a local on each match.
        /// </summary>
        [Benchmark(Baseline = true)]
        public void Before()
        {
            int indexSum = 0;
            for (int n = 0; n < Count; ++n)
            {
                bool matched = regex.IsMatch(Input!);
                if (matched)
                {
                    indexSum += n;
                }
            }
        }

        /// <summary>
        /// Same loop as <see cref="Before"/>, but obtains the regex from
        /// <see cref="MyRegex"/> on every iteration.
        /// </summary>
        [Benchmark]
        public void After()
        {
            int indexSum = 0;
            for (int n = 0; n < Count; ++n)
            {
                bool matched = MyRegex().IsMatch(Input!);
                if (matched)
                {
                    indexSum += n;
                }
            }
        }
    }
}

基准测试结果:

在.Net 7中正则表达式使用新方式性能对比

在另外一台机器测试,得出的结果也差不多.可以看到使用新方式,性能最高提升54%,最低也有20%的提升.通过代码生成的方式对性能提升还是可以的.

为什么是代码生成呢?

反编译一下程序集(代码):
// Decompiled view of a program that declares MyRegex() with [RegexGenerator].
// The nested GeneratedRegex_MyRegex_D859DD68 class carries a GeneratedCode
// attribute naming System.Text.RegularExpressions.Generator: the matching logic
// for "abc|def" is emitted as ordinary C# instead of being interpreted.
public class Program
{
    [GeneratedCode("System.Text.RegularExpressions.Generator", "7.0.6.15202")]
    [EditorBrowsable(EditorBrowsableState.Never)]
    private sealed class GeneratedRegex_MyRegex_D859DD68 : Regex   // 1. Derives from Regex
    {
        private sealed class RunnerFactory : RegexRunnerFactory    // 2. Derives from RegexRunnerFactory
        {
            private sealed class Runner : RegexRunner              // 3. Derives from RegexRunner
            {
                protected override void InitTrackCount()           // 4. Overrides InitTrackCount
                {
                    runtrackcount = 7;              // Original note: "determines the length of the expression". NOTE(review): confirm what runtrackcount actually sizes.
                }

                // Scans forward from runtextpos for a position where a match could begin.
                // On success stores that position in runtextpos and returns true;
                // otherwise sets runtextpos to the end of the text and returns false.
                protected override bool FindFirstChar()            // 5. Overrides FindFirstChar
                {
                    int pos = runtextpos;
                    int end = runtextend;
                    ReadOnlySpan<char> inputSpan = runtext;
                    // Only searches when more than two characters remain.
                    if (pos < end - 2)
                    {
                        ReadOnlySpan<char> span = inputSpan.Slice(pos, end - pos);
                        int i;
                        for (i = 0; i < span.Length - 2; i++)
                        {
                            // "CFcf" is produced by the generator from the pattern: the third
                            // character of each alternative (ab[c] | de[f]) in both cases.
                            // The search starts at offset i + 2, so a hit is a candidate third character.
                            int indexOfPos = span.Slice(i + 2).IndexOfAny("CFcf");
                            if (indexOfPos < 0)
                            {
                                break;
                            }
                            i += indexOfPos;
                            if (i >= span.Length - 2)
                            {
                                break;
                            }
                            char ch;
                            // ASCII bitmap lookups: each string is indexed by (ch >> 4) and tested
                            // against bit (ch & 0xF). The first table accepts 'A', 'D', 'a', 'd'
                            // for span[i]; the second accepts 'B', 'E', 'b', 'e' for span[i + 1].
                            if ((ch = span[i]) < '\u0080' && ("\0\0\0\0\u0012\0\u0012\0"[(int)ch >> 4] & (1 << (ch & 0xF))) != 0 && (ch = span[i + 1]) < '\u0080' && ("\0\0\0\0$\0$\0"[(int)ch >> 4] & (1 << (ch & 0xF))) != 0)
                            {
                                runtextpos = pos + i;
                                return true;
                            }
                        }
                    }
                    runtextpos = end;
                    return false;
                }

                // Attempts the actual match at runtextpos. Returns without recording a
                // capture when the text there is not "abc" or "def" (ASCII case-insensitive);
                // otherwise advances runtextpos by 3 and records capture group 0.
                [SkipLocalsInit]  // Optimization: with SkipLocalsInit the method's locals are not zero-initialized
                protected override void Go()
                {
                    ReadOnlySpan<char> inputSpan = runtext;
                    int pos = runtextpos;
                    int end = runtextend;
                    int original_pos = pos;
                    // textInfo is assigned here but not referenced again in this method.
                    TextInfo textInfo = CultureInfo.CurrentCulture.TextInfo;
                    ReadOnlySpan<char> slice = inputSpan.Slice(pos, end - pos);
                    if (slice.IsEmpty)
                    {
                        return;
                    }
                    switch (slice[0]) // Branches generated at compile time from the pattern
                    {
                        default:
                            return;
                        case 'A':
                        case 'a':
                            // | 0x20 folds an ASCII uppercase letter to lowercase; 98 = 'b', 99 = 'c'.
                            if ((uint)slice.Length < 3u || (slice[1] | 0x20) != 98 || (slice[2] | 0x20) != 99)
                            {
                                return;
                            }
                            pos += 3;
                            slice = slice.Slice(3);
                            break;
                        case 'D':
                        case 'd':
                            // Same folding as above; 101 = 'e', 102 = 'f'.
                            if ((uint)slice.Length < 3u || (slice[1] | 0x20) != 101 || (slice[2] | 0x20) != 102)
                            {
                                return;
                            }
                            pos += 3;
                            slice = slice.Slice(3);
                            break;
                    }
                    runtextpos = pos;
                    Capture(0, original_pos, pos);
                }
            }

            // Returns a fresh Runner each time it is called.
            protected override RegexRunner CreateInstance()
            {
                return new Runner();  
            }
        }

        // Singleton: one shared instance, created by the static property initializer
        public static Regex Instance { get; } = new GeneratedRegex_MyRegex_D859DD68();


        // Private constructor: fills in the Regex state (pattern, options, timeout,
        // runner factory, capture count) for the generated matcher.
        private GeneratedRegex_MyRegex_D859DD68()
        {
            pattern = "abc|def";  // The regex pattern
            roptions = RegexOptions.IgnoreCase | RegexOptions.Compiled; // Regex options
            internalMatchTimeout = Timeout.InfiniteTimeSpan;
            factory = new RunnerFactory();
            capsize = 1;
        }
    }

    // The partial method declared by the user; its generated body returns the shared instance.
    [RegexGenerator("abc|def", RegexOptions.IgnoreCase)]
    [GeneratedCode("System.Text.RegularExpressions.Generator", "7.0.6.15202")]
    public static Regex MyRegex()
    {
        return GeneratedRegex_MyRegex_D859DD68.Instance;
    }

    // Entry point: prints whether "xyz" matches the generated regex.
    private static void Main(string[] args)
    {
        Console.WriteLine(MyRegex().IsMatch("xyz"));
    }
}

调用流程图

在.Net 7中正则表达式使用新方式内部执行流程图
秋风 2022-03-18