CharsetDetector · harnel-tngn · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026
diff --git a/src/CharsetDetector.cs b/src/CharsetDetector.cs
@@ -117,6 +117,20 @@ private CharsetDetector()
         _lastChar = 0x00;
     }
 
+    /// <summary>
+    /// Detect the character encoding of the bytes in a span.
+    /// The BOM search starts at the first byte of the span, so slice the span
+    /// before passing it if only a range should be inspected.
+    /// </summary>
+    /// <param name="bytes">The bytes containing the text; the whole span is fed to a new detector in one call</param>
+    /// <returns>The <see cref="DetectionResult"/> returned by <c>DataEnd()</c> after the span has been fed</returns>
+    public static DetectionResult DetectFromBytes(ReadOnlySpan<byte> bytes)
+    {
+        var detector = new CharsetDetector(); // fresh detector per call
+        detector.Feed(bytes);
+        return detector.DataEnd();
+    }
+
     /// <summary>
-    /// Detect the character encoding form this byte array.
+    /// Detect the character encoding from this byte array.
     /// It searches for BOM from bytes[0].
@@ -130,9 +144,7 @@ public static DetectionResult DetectFromBytes(byte[] bytes)
             throw new ArgumentNullException(nameof(bytes));
         }
 
-        var detector = new CharsetDetector();
-        detector.Feed(bytes, 0, bytes.Length);
-        return detector.DataEnd();
+        return DetectFromBytes(bytes.AsSpan());
     }
 
     /// <summary>
@@ -161,10 +173,8 @@ public static DetectionResult DetectFromBytes(byte[] bytes, int offset, int len)
         {
             throw new ArgumentException($"{nameof(len)} is greater than the number of bytes from {nameof(offset)} to the end of the array.");
         }
-
-        var detector = new CharsetDetector();
-        detector.Feed(bytes, offset, len);
-        return detector.DataEnd();
+
+        return DetectFromBytes(bytes.AsSpan(offset, len));
     }
 
     /// <summary>
@@ -291,7 +301,7 @@ private static async Task ReadStreamAsync(Stream stream, long? maxBytes, Charset
 
     private static bool FeedDetector(CharsetDetector detector, long? maxBytes, byte[] buff, int read, ref long readTotal, ref int toRead)
     {
-        detector.Feed(buff, 0, read);
+        detector.Feed(buff.AsSpan(0, read));
 
         if (maxBytes == null)
         {
@@ -401,37 +411,37 @@ private static FileStream OpenFile(string filePath)
             FileShare.ReadWrite);
     }
 
-    protected virtual void Feed(byte[] buf, int offset, int len)
+    protected virtual void Feed(ReadOnlySpan<byte> buf) // callers slice the span instead of passing offset/len
     {
         if (_done)
         {
             return;
         }
 
-        if (len > 0)
+        if (buf.Length > 0) // an empty span does not count as received data
             _gotData = true;
 
         // If the data starts with BOM, we know it is UTF
         if (_start)
         {
             _start = false;
-            _done = IsStartsWithBom(buf, offset, len);
+            _done = IsStartsWithBom(buf); // _start was just cleared, so this BOM check is not repeated on later calls
             if (_done)
                 return;
         }
 
-        FindInputState(buf, offset, len);
+        FindInputState(buf);
         foreach (var prober in CharsetProbers)
         {
-            _done = RunProber(buf, offset, len, prober);
+            _done = RunProber(buf, prober); // a true result ends detection; the remaining probers are not fed this buffer
             if (_done)
                 return;
         }
     }
 
-    private bool IsStartsWithBom(byte[] buf, int offset, int len)
+    private bool IsStartsWithBom(ReadOnlySpan<byte> buf)
     {
-        var bomSet = FindCharSetByBom(buf, offset, len);
+        var bomSet = FindCharSetByBom(buf);
         if (bomSet != null)
         {
             _detectionDetail = new DetectionDetail(bomSet, 1.0f)
@@ -443,9 +453,9 @@ private bool IsStartsWithBom(byte[] buf, int offset, int len)
         return false;
     }
 
-    private bool RunProber(byte[] buf, int offset, int len, CharsetProber charsetProber)
+    private bool RunProber(ReadOnlySpan<byte> buf, CharsetProber charsetProber)
     {
-        var probingState = charsetProber.HandleData(buf, offset, len);
+        var probingState = charsetProber.HandleData(buf);
         if (probingState == ProbingState.FoundIt)
         {
             _detectionDetail = new DetectionDetail(charsetProber);
@@ -454,9 +464,9 @@ private bool RunProber(byte[] buf, int offset, int len, CharsetProber charsetPro
         return false;
     }
 
-    private void FindInputState(byte[] buf, int offset, int len)
+    private void FindInputState(ReadOnlySpan<byte> buf)
     {
-        for (int i = offset; i < len; i++)
+        for (int i = 0; i < buf.Length; i++)
         {
             // other than 0xa0, if every other character is ascii, the page is ascii
             if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0)
@@ -485,59 +495,59 @@ private void FindInputState(byte[] buf, int offset, int len)
         }
     }
 
-    private static string FindCharSetByBom(byte[] buf, int offset, int len)
+    private static string FindCharSetByBom(ReadOnlySpan<byte> buf)
     {
-        if (len < 2)
+        if (buf.Length < 2)
             return null;
 
-        var buf0 = buf[offset + 0];
-        var buf1 = buf[offset + 1];
+        var buf0 = buf[0];
+        var buf1 = buf[1];
 
         if (buf0 == 0xFE && buf1 == 0xFF)
         {
             // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
-            return len > 3
-                   && buf[offset + 2] == 0x00 && buf[offset + 3] == 0x00
+            return buf.Length > 3
+                   && buf[2] == 0x00 && buf[3] == 0x00
                 ? CodepageName.X_ISO_10646_UCS_4_3412
                 : CodepageName.UTF16_BE;
         }
 
         if (buf0 == 0xFF && buf1 == 0xFE)
         {
-            return len > 3
-                   && buf[offset + 2] == 0x00 && buf[offset + 3] == 0x00
+            return buf.Length > 3
+                   && buf[2] == 0x00 && buf[3] == 0x00
                 ? CodepageName.UTF32_LE
                 : CodepageName.UTF16_LE;
         }
 
-        if (len < 3)
+        if (buf.Length < 3)
             return null;
 
-        if (buf0 == 0xEF && buf1 == 0xBB && buf[offset + 2] == 0xBF)
+        if (buf0 == 0xEF && buf1 == 0xBB && buf[2] == 0xBF)
             return CodepageName.UTF8;
 
-        if (len < 4)
+        if (buf.Length < 4)
             return null;
 
         //Here, because anyway further more than 3 positions are checked.
         if (buf0 == 0x00 && buf1 == 0x00)
         {
-            if (buf[offset + 2] == 0xFE && buf[offset + 3] == 0xFF)
+            if (buf[2] == 0xFE && buf[3] == 0xFF)
                 return CodepageName.UTF32_BE;
 
             // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
-            if (buf[offset + 2] == 0xFF && buf[offset + 3] == 0xFE)
+            if (buf[2] == 0xFF && buf[3] == 0xFE)
                 return CodepageName.X_ISO_10646_UCS_4_2143;
         }
 
         // Detect utf-7 with bom (see table in https://en.wikipedia.org/wiki/Byte_order_mark)
-        if (buf0 == 0x2B && buf1 == 0x2F && buf[offset + 2] == 0x76)
-            if (buf[offset + 3] == 0x38 || buf[offset + 3] == 0x39 || buf[offset + 3] == 0x2B || buf[offset + 3] == 0x2F)
+        if (buf0 == 0x2B && buf1 == 0x2F && buf[2] == 0x76)
+            if (buf[3] == 0x38 || buf[3] == 0x39 || buf[3] == 0x2B || buf[3] == 0x2F)
                 return CodepageName.UTF7;
 
         // Detect GB18030 with bom (see table in https://en.wikipedia.org/wiki/Byte_order_mark)
         // TODO: If you remove this check, GB18030Prober will still be defined as GB18030 -- It's feature or bug?
-        if (buf0 == 0x84 && buf1 == 0x31 && buf[offset + 2] == 0x95 && buf[offset + 3] == 0x33)
+        if (buf0 == 0x84 && buf1 == 0x31 && buf[2] == 0x95 && buf[3] == 0x33)
             return CodepageName.GB18030;
 
         return null;

diff --git a/src/Core/Analyzers/CharDistributionAnalyser.cs b/src/Core/Analyzers/CharDistributionAnalyser.cs
@@ -35,6 +35,8 @@
  *
  * ***** END LICENSE BLOCK ***** */
 
+using System;
+
 namespace UtfUnknown.Core.Analyzers;
 
 /// <summary>
@@ -77,20 +79,18 @@ public CharDistributionAnalyser()
     /// This allow multiple encoding of a language to share one frequency table
     /// </remarks>
-    /// <param name="buf">A <see cref="System.Byte"/></param>
+    /// <param name="buf">Span whose first element is the first byte of the character</param>
-    /// <param name="offset"></param>
     /// <returns></returns>
-    public abstract int GetOrder(byte[] buf, int offset);
+    public abstract int GetOrder(ReadOnlySpan<byte> buf);
 
     /// <summary>
     /// Feed a character with known length
     /// </summary>
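-    /// <param name="buf">A <see cref="System.Byte"/></param>
-    /// <param name="offset">buf offset</param>
-    /// <param name="charLen">1 of 2 char length?</param>
+    /// <param name="buf">Span whose first element is the first byte of the character</param>
+    /// <param name="charLen">Length of the character in bytes; only 2-byte characters are analysed</param>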
-    public void HandleOneChar(byte[] buf, int offset, int charLen)
+    public void HandleOneChar(ReadOnlySpan<byte> buf, int charLen)
     {
         //we only care about 2-bytes character in our distribution analysis
-        int order = (charLen == 2) ? GetOrder(buf, offset) : -1;
+        int order = (charLen == 2) ? GetOrder(buf) : -1;
         if (order >= 0)
         {
             totalChars++;
@@ -136,4 +136,4 @@ public bool GotEnoughData()
     {
         return totalChars > ENOUGH_DATA_THRESHOLD;
     }
-}
+}
diff --git a/src/Core/Analyzers/MultiByte/Chinese/BIG5DistributionAnalyser.cs b/src/Core/Analyzers/MultiByte/Chinese/BIG5DistributionAnalyser.cs
@@ -1,3 +1,5 @@
+using System;
+
 namespace UtfUnknown.Core.Analyzers.Chinese;
 
 public class BIG5DistributionAnalyser : CharDistributionAnalyser
@@ -914,13 +916,13 @@ public BIG5DistributionAnalyser()
     ///  second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
     /// no validation needed here. State machine has done that
     /// </summary>
-    public override int GetOrder(byte[] buf, int offset)
+    public override int GetOrder(ReadOnlySpan<byte> buf)
     {
-        if (buf[offset] >= 0xA4) {
-            if (buf[offset+1] >= 0xA1)
-                return 157 * (buf[offset] - 0xA4) + buf[offset+1] - 0xA1 + 63;
+        if (buf[0] >= 0xA4) {
+            if (buf[1] >= 0xA1)
+                return 157 * (buf[0] - 0xA4) + buf[1] - 0xA1 + 63;
             else
-                return 157 * (buf[offset] - 0xA4) + buf[offset+1] - 0x40;
+                return 157 * (buf[0] - 0xA4) + buf[1] - 0x40;
         } else {
             return -1;
         }

diff --git a/src/Core/Analyzers/MultiByte/Chinese/EUCTWDistributionAnalyser.cs b/src/Core/Analyzers/MultiByte/Chinese/EUCTWDistributionAnalyser.cs
@@ -1,3 +1,5 @@
+using System;
+
 namespace UtfUnknown.Core.Analyzers.Chinese;
 
 public class EUCTWDistributionAnalyser : CharDistributionAnalyser
@@ -417,11 +419,11 @@ public EUCTWDistributionAnalyser()
     ///  second byte range: 0xa1 -- 0xfe
     /// no validation needed here. State machine has done that
     /// </summary>
-    public override int GetOrder(byte[] buf, int offset)
+    public override int GetOrder(ReadOnlySpan<byte> buf)
     {
-        if (buf[offset] >= 0xC4)
-            return 94 * (buf[offset] - 0xC4) + buf[offset+1] - 0xA1;
+        if (buf[0] >= 0xC4) // first bytes below 0xC4 get no order (-1)
+            return 94 * (buf[0] - 0xC4) + buf[1] - 0xA1; // 94 slots per first byte; second byte counted from 0xA1
         else
             return -1;
     }
-}
+}
diff --git a/src/Core/Analyzers/MultiByte/Chinese/GB18030DistributionAnalyser.cs b/src/Core/Analyzers/MultiByte/Chinese/GB18030DistributionAnalyser.cs
@@ -1,3 +1,5 @@
+using System;
+
 namespace UtfUnknown.Core.Analyzers.Chinese;
 
 public class GB18030DistributionAnalyser : CharDistributionAnalyser
@@ -463,10 +465,10 @@ public GB18030DistributionAnalyser() : base()
     /// no validation needed here. State machine has done that
     /// </summary>
     /// <returns></returns>
-    public override int GetOrder(byte[] buf, int offset)
+    public override int GetOrder(ReadOnlySpan<byte> buf)
     {
-        if (buf[offset] >= 0xB0 && buf[offset+1] >= 0xA1)
-            return 94 * (buf[offset] - 0xb0) + buf[offset+1] - 0xA1;
+        if (buf[0] >= 0xB0 && buf[1] >= 0xA1) // both bytes must reach these lower bounds, otherwise the result is -1
+            return 94 * (buf[0] - 0xb0) + buf[1] - 0xA1; // 94 slots per first byte; second byte counted from 0xA1
         else
             return -1;
     }

diff --git a/src/Core/Analyzers/MultiByte/Japanese/EUCJPContextAnalyser.cs b/src/Core/Analyzers/MultiByte/Japanese/EUCJPContextAnalyser.cs
@@ -1,12 +1,14 @@
+using System;
+
 namespace UtfUnknown.Core.Analyzers.Japanese;
 
 public class EUCJPContextAnalyser : JapaneseContextAnalyser
 {
     private const byte HIRAGANA_FIRST_BYTE = 0xA4;
 
-    protected override int GetOrder(byte[] buf, int offset, out int charLen)
+    protected override int GetOrder(ReadOnlySpan<byte> buf, out int charLen)
     {
-        byte high = buf[offset];
+        byte high = buf[0];
 
         //find out current char's byte length
         if (high == 0x8E || high >= 0xA1 && high <= 0xFE)
@@ -18,21 +20,21 @@ protected override int GetOrder(byte[] buf, int offset, out int charLen)
 
         // return its order if it is hiragana
         if (high == HIRAGANA_FIRST_BYTE) {
-            byte low = buf[offset+1];
+            byte low = buf[1];
             if (low >= 0xA1 && low <= 0xF3)
                 return low - 0xA1;
         }
         return -1;
     }
 
-    protected override int GetOrder(byte[] buf, int offset)
+    protected override int GetOrder(ReadOnlySpan<byte> buf)
     {
         // We are only interested in Hiragana
-        if (buf[offset] == HIRAGANA_FIRST_BYTE) {
-            byte low = buf[offset+1];
+        if (buf[0] == HIRAGANA_FIRST_BYTE) { // buf[1] is read next, so buf must hold at least two bytes here
+            byte low = buf[1]; // second byte; only 0xA1..0xF3 maps to an order
             if (low >= 0xA1 && low <= 0xF3)
                 return low - 0xA1;
         }
         return -1;
     }
-}
+}
diff --git a/src/Core/Analyzers/MultiByte/Japanese/EUCJPDistributionAnalyser.cs b/src/Core/Analyzers/MultiByte/Japanese/EUCJPDistributionAnalyser.cs
@@ -1,3 +1,5 @@
+using System;
+
 namespace UtfUnknown.Core.Analyzers.Japanese;
 
 public class EUCJPDistributionAnalyser : SJISDistributionAnalyser
@@ -7,10 +9,10 @@ public class EUCJPDistributionAnalyser : SJISDistributionAnalyser
     ///  second byte range: 0xa1 -- 0xfe
     /// no validation needed here. State machine has done that
     /// </summary>
-    public override int GetOrder(byte[] buf, int offset)
+    public override int GetOrder(ReadOnlySpan<byte> buf)
     {
-        if (buf[offset] >= 0xA0)
-            return 94 * (buf[offset] - 0xA1) + buf[offset+1] - 0xA1;
+        if (buf[0] >= 0xA0) // NOTE(review): guard is 0xA0 but the formula subtracts 0xA1, so a first byte of 0xA0 yields a result <= 0 - confirm intended
+            return 94 * (buf[0] - 0xA1) + buf[1] - 0xA1; // 94 slots per first byte; second byte counted from 0xA1
         else
             return -1;
     }