From c46d4d5ac3f067593cc26337ae7d1276b6d9a6b0 Mon Sep 17 00:00:00 2001 From: Harnel Date: Mon, 29 Jun 2026 16:57:14 +0900 Subject: [PATCH 1/5] Support detecting encoding from ReadOnlySpan --- src/CharsetDetector.cs | 80 +++++++++++-------- .../Analyzers/CharDistributionAnalyser.cs | 8 +- .../Chinese/BIG5DistributionAnalyser.cs | 4 +- .../Chinese/EUCTWDistributionAnalyser.cs | 6 +- .../Chinese/GB18030DistributionAnalyser.cs | 4 +- .../Japanese/EUCJPContextAnalyser.cs | 8 +- .../Japanese/EUCJPDistributionAnalyser.cs | 4 +- .../Japanese/JapaneseContextAnalyser.cs | 10 ++- .../MultiByte/Japanese/SJISContextAnalyser.cs | 10 ++- .../Japanese/SJISDistributionAnalyser.cs | 4 +- .../Korean/EUCKRDistributionAnalyser.cs | 6 +- src/Core/Probers/CharsetProber.cs | 52 ++++++++---- src/Core/Probers/EscCharsetProber.cs | 7 +- src/Core/Probers/HebrewProber.cs | 7 +- src/Core/Probers/Latin1Prober.cs | 5 +- src/Core/Probers/MBCSGroupProber.cs | 11 +-- .../Probers/MultiByte/Chinese/Big5Prober.cs | 11 +-- .../Probers/MultiByte/Chinese/EUCTWProber.cs | 9 ++- .../MultiByte/Chinese/GB18030Prober.cs | 11 +-- .../Probers/MultiByte/Japanese/EUCJPProber.cs | 11 +-- .../Probers/MultiByte/Japanese/SJISProber.cs | 11 +-- .../Probers/MultiByte/Korean/CP949Prober.cs | 11 +-- .../Probers/MultiByte/Korean/EUCKRProber.cs | 11 +-- src/Core/Probers/MultiByte/UTF8Prober.cs | 7 +- src/Core/Probers/SBCSGroupProber.cs | 7 +- src/Core/Probers/SingleByteCharSetProber.cs | 6 +- src/UTF-unknown.csproj | 1 + 27 files changed, 193 insertions(+), 129 deletions(-) diff --git a/src/CharsetDetector.cs b/src/CharsetDetector.cs index 5fea071..5e2fb2e 100644 --- a/src/CharsetDetector.cs +++ b/src/CharsetDetector.cs @@ -117,6 +117,20 @@ private CharsetDetector() _lastChar = 0x00; } + /// + /// Detect the character encoding of these bytes. + /// It searches for BOM from the start of the span. + /// Slice the span before passing it if only a range should be inspected. + /// + /// The bytes containing the text + /// + public static DetectionResult DetectFromBytes(ReadOnlySpan bytes) + { + var detector = new CharsetDetector(); + detector.Feed(bytes); + return detector.DataEnd(); + } + /// /// Detect the character encoding form this byte array. /// It searches for BOM from bytes[0]. @@ -130,9 +144,7 @@ public static DetectionResult DetectFromBytes(byte[] bytes) throw new ArgumentNullException(nameof(bytes)); } - var detector = new CharsetDetector(); - detector.Feed(bytes, 0, bytes.Length); - return detector.DataEnd(); + return DetectFromBytes(bytes.AsSpan()); } /// @@ -161,10 +173,8 @@ public static DetectionResult DetectFromBytes(byte[] bytes, int offset, int len) { throw new ArgumentException($"{nameof(len)} is greater than the number of bytes from {nameof(offset)} to the end of the array."); } - - var detector = new CharsetDetector(); - detector.Feed(bytes, offset, len); - return detector.DataEnd(); + + return DetectFromBytes(bytes.AsSpan(offset, len)); } /// @@ -291,7 +301,7 @@ private static async Task ReadStreamAsync(Stream stream, long? maxBytes, Charset private static bool FeedDetector(CharsetDetector detector, long? maxBytes, byte[] buff, int read, ref long readTotal, ref int toRead) { - detector.Feed(buff, 0, read); + detector.Feed(buff.AsSpan(0, read)); if (maxBytes == null) { @@ -401,37 +411,37 @@ private static FileStream OpenFile(string filePath) FileShare.ReadWrite); } - protected virtual void Feed(byte[] buf, int offset, int len) + protected virtual void Feed(ReadOnlySpan buf) { if (_done) { return; } - if (len > 0) + if (buf.Length > 0) _gotData = true; // If the data starts with BOM, we know it is UTF if (_start) { _start = false; - _done = IsStartsWithBom(buf, offset, len); + _done = IsStartsWithBom(buf); if (_done) return; } - FindInputState(buf, offset, len); + FindInputState(buf); foreach (var prober in CharsetProbers) { - _done = RunProber(buf, offset, len, prober); + _done = RunProber(buf, prober); if (_done) return; } } - private bool IsStartsWithBom(byte[] buf, int offset, int len) + private bool IsStartsWithBom(ReadOnlySpan buf) { - var bomSet = FindCharSetByBom(buf, offset, len); + var bomSet = FindCharSetByBom(buf); if (bomSet != null) { _detectionDetail = new DetectionDetail(bomSet, 1.0f) @@ -443,9 +453,9 @@ private bool IsStartsWithBom(byte[] buf, int offset, int len) return false; } - private bool RunProber(byte[] buf, int offset, int len, CharsetProber charsetProber) + private bool RunProber(ReadOnlySpan buf, CharsetProber charsetProber) { - var probingState = charsetProber.HandleData(buf, offset, len); + var probingState = charsetProber.HandleData(buf); if (probingState == ProbingState.FoundIt) { _detectionDetail = new DetectionDetail(charsetProber); @@ -454,9 +464,9 @@ private bool RunProber(byte[] buf, int offset, int len, CharsetProber charsetPro return false; } - private void FindInputState(byte[] buf, int offset, int len) + private void FindInputState(ReadOnlySpan buf) { - for (int i = offset; i < len; i++) + for (int i = 0; i < buf.Length; i++) { // other than 0xa0, if every other character is ascii, the page is ascii if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0) @@ -485,59 +495,59 @@ private void FindInputState(byte[] buf, int offset, int len) } } - private static string FindCharSetByBom(byte[] buf, int offset, int len) + private static string FindCharSetByBom(ReadOnlySpan buf) { - if (len < 2) + if (buf.Length < 2) return null; - var buf0 = buf[offset + 0]; - var buf1 = buf[offset + 1]; + var buf0 = buf[0]; + var buf1 = buf[1]; if (buf0 == 0xFE && buf1 == 0xFF) { // FE FF 00 00 UCS-4, unusual octet order BOM (3412) - return len > 3 - && buf[offset + 2] == 0x00 && buf[offset + 3] == 0x00 + return buf.Length > 3 + && buf[2] == 0x00 && buf[3] == 0x00 ? CodepageName.X_ISO_10646_UCS_4_3412 : CodepageName.UTF16_BE; } if (buf0 == 0xFF && buf1 == 0xFE) { - return len > 3 - && buf[offset + 2] == 0x00 && buf[offset + 3] == 0x00 + return buf.Length > 3 + && buf[2] == 0x00 && buf[3] == 0x00 ? CodepageName.UTF32_LE : CodepageName.UTF16_LE; } - if (len < 3) + if (buf.Length < 3) return null; - if (buf0 == 0xEF && buf1 == 0xBB && buf[offset + 2] == 0xBF) + if (buf0 == 0xEF && buf1 == 0xBB && buf[2] == 0xBF) return CodepageName.UTF8; - if (len < 4) + if (buf.Length < 4) return null; //Here, because anyway further more than 3 positions are checked. if (buf0 == 0x00 && buf1 == 0x00) { - if (buf[offset + 2] == 0xFE && buf[offset + 3] == 0xFF) + if (buf[2] == 0xFE && buf[3] == 0xFF) return CodepageName.UTF32_BE; // 00 00 FF FE UCS-4, unusual octet order BOM (2143) - if (buf[offset + 2] == 0xFF && buf[offset + 3] == 0xFE) + if (buf[2] == 0xFF && buf[3] == 0xFE) return CodepageName.X_ISO_10646_UCS_4_2143; } // Detect utf-7 with bom (see table in https://en.wikipedia.org/wiki/Byte_order_mark) - if (buf0 == 0x2B && buf1 == 0x2F && buf[offset + 2] == 0x76) - if (buf[offset + 3] == 0x38 || buf[offset + 3] == 0x39 || buf[offset + 3] == 0x2B || buf[offset + 3] == 0x2F) + if (buf0 == 0x2B && buf1 == 0x2F && buf[2] == 0x76) + if (buf[3] == 0x38 || buf[3] == 0x39 || buf[3] == 0x2B || buf[3] == 0x2F) return CodepageName.UTF7; // Detect GB18030 with bom (see table in https://en.wikipedia.org/wiki/Byte_order_mark) // TODO: If you remove this check, GB18030Prober will still be defined as GB18030 -- It's feature or bug? - if (buf0 == 0x84 && buf1 == 0x31 && buf[offset + 2] == 0x95 && buf[offset + 3] == 0x33) + if (buf0 == 0x84 && buf1 == 0x31 && buf[2] == 0x95 && buf[3] == 0x33) return CodepageName.GB18030; return null; diff --git a/src/Core/Analyzers/CharDistributionAnalyser.cs b/src/Core/Analyzers/CharDistributionAnalyser.cs index b353eb2..d29158b 100644 --- a/src/Core/Analyzers/CharDistributionAnalyser.cs +++ b/src/Core/Analyzers/CharDistributionAnalyser.cs @@ -35,6 +35,8 @@ * * ***** END LICENSE BLOCK ***** */ +using System; + namespace UtfUnknown.Core.Analyzers; /// @@ -79,7 +81,7 @@ public CharDistributionAnalyser() /// A /// /// - public abstract int GetOrder(byte[] buf, int offset); + public abstract int GetOrder(ReadOnlySpan buf, int offset); /// /// Feed a character with known length @@ -87,7 +89,7 @@ public CharDistributionAnalyser() /// A /// buf offset /// 1 of 2 char length? - public void HandleOneChar(byte[] buf, int offset, int charLen) + public void HandleOneChar(ReadOnlySpan buf, int offset, int charLen) { //we only care about 2-bytes character in our distribution analysis int order = (charLen == 2) ? GetOrder(buf, offset) : -1; @@ -136,4 +138,4 @@ public bool GotEnoughData() { return totalChars > ENOUGH_DATA_THRESHOLD; } -} \ No newline at end of file +} diff --git a/src/Core/Analyzers/MultiByte/Chinese/BIG5DistributionAnalyser.cs b/src/Core/Analyzers/MultiByte/Chinese/BIG5DistributionAnalyser.cs index 636bfa7..b2fdbc3 100644 --- a/src/Core/Analyzers/MultiByte/Chinese/BIG5DistributionAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Chinese/BIG5DistributionAnalyser.cs @@ -1,3 +1,5 @@ +using System; + namespace UtfUnknown.Core.Analyzers.Chinese; public class BIG5DistributionAnalyser : CharDistributionAnalyser @@ -914,7 +916,7 @@ public BIG5DistributionAnalyser() /// second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe /// no validation needed here. State machine has done that /// - public override int GetOrder(byte[] buf, int offset) + public override int GetOrder(ReadOnlySpan buf, int offset) { if (buf[offset] >= 0xA4) { if (buf[offset+1] >= 0xA1) diff --git a/src/Core/Analyzers/MultiByte/Chinese/EUCTWDistributionAnalyser.cs b/src/Core/Analyzers/MultiByte/Chinese/EUCTWDistributionAnalyser.cs index c840ad0..77fb052 100644 --- a/src/Core/Analyzers/MultiByte/Chinese/EUCTWDistributionAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Chinese/EUCTWDistributionAnalyser.cs @@ -1,3 +1,5 @@ +using System; + namespace UtfUnknown.Core.Analyzers.Chinese; public class EUCTWDistributionAnalyser : CharDistributionAnalyser @@ -417,11 +419,11 @@ public EUCTWDistributionAnalyser() /// second byte range: 0xa1 -- 0xfe /// no validation needed here. State machine has done that /// - public override int GetOrder(byte[] buf, int offset) + public override int GetOrder(ReadOnlySpan buf, int offset) { if (buf[offset] >= 0xC4) return 94 * (buf[offset] - 0xC4) + buf[offset+1] - 0xA1; else return -1; } -} \ No newline at end of file +} diff --git a/src/Core/Analyzers/MultiByte/Chinese/GB18030DistributionAnalyser.cs b/src/Core/Analyzers/MultiByte/Chinese/GB18030DistributionAnalyser.cs index ae49f3f..501ebd6 100644 --- a/src/Core/Analyzers/MultiByte/Chinese/GB18030DistributionAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Chinese/GB18030DistributionAnalyser.cs @@ -1,3 +1,5 @@ +using System; + namespace UtfUnknown.Core.Analyzers.Chinese; public class GB18030DistributionAnalyser : CharDistributionAnalyser @@ -463,7 +465,7 @@ public GB18030DistributionAnalyser() : base() /// no validation needed here. State machine has done that /// /// - public override int GetOrder(byte[] buf, int offset) + public override int GetOrder(ReadOnlySpan buf, int offset) { if (buf[offset] >= 0xB0 && buf[offset+1] >= 0xA1) return 94 * (buf[offset] - 0xb0) + buf[offset+1] - 0xA1; diff --git a/src/Core/Analyzers/MultiByte/Japanese/EUCJPContextAnalyser.cs b/src/Core/Analyzers/MultiByte/Japanese/EUCJPContextAnalyser.cs index 83bdaa7..8c934a8 100644 --- a/src/Core/Analyzers/MultiByte/Japanese/EUCJPContextAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Japanese/EUCJPContextAnalyser.cs @@ -1,10 +1,12 @@ +using System; + namespace UtfUnknown.Core.Analyzers.Japanese; public class EUCJPContextAnalyser : JapaneseContextAnalyser { private const byte HIRAGANA_FIRST_BYTE = 0xA4; - protected override int GetOrder(byte[] buf, int offset, out int charLen) + protected override int GetOrder(ReadOnlySpan buf, int offset, out int charLen) { byte high = buf[offset]; @@ -25,7 +27,7 @@ protected override int GetOrder(byte[] buf, int offset, out int charLen) return -1; } - protected override int GetOrder(byte[] buf, int offset) + protected override int GetOrder(ReadOnlySpan buf, int offset) { // We are only interested in Hiragana if (buf[offset] == HIRAGANA_FIRST_BYTE) { @@ -35,4 +37,4 @@ protected override int GetOrder(byte[] buf, int offset) } return -1; } -} \ No newline at end of file +} diff --git a/src/Core/Analyzers/MultiByte/Japanese/EUCJPDistributionAnalyser.cs b/src/Core/Analyzers/MultiByte/Japanese/EUCJPDistributionAnalyser.cs index fc624f9..4402ec1 100644 --- a/src/Core/Analyzers/MultiByte/Japanese/EUCJPDistributionAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Japanese/EUCJPDistributionAnalyser.cs @@ -1,3 +1,5 @@ +using System; + namespace UtfUnknown.Core.Analyzers.Japanese; public class EUCJPDistributionAnalyser : SJISDistributionAnalyser @@ -7,7 +9,7 @@ public class EUCJPDistributionAnalyser : SJISDistributionAnalyser /// second byte range: 0xa1 -- 0xfe /// no validation needed here. State machine has done that /// - public override int GetOrder(byte[] buf, int offset) + public override int GetOrder(ReadOnlySpan buf, int offset) { if (buf[offset] >= 0xA0) return 94 * (buf[offset] - 0xA1) + buf[offset+1] - 0xA1; diff --git a/src/Core/Analyzers/MultiByte/Japanese/JapaneseContextAnalyser.cs b/src/Core/Analyzers/MultiByte/Japanese/JapaneseContextAnalyser.cs index dd47bad..4691ce0 100644 --- a/src/Core/Analyzers/MultiByte/Japanese/JapaneseContextAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Japanese/JapaneseContextAnalyser.cs @@ -36,6 +36,8 @@ * * ***** END LICENSE BLOCK ***** */ +using System; + namespace UtfUnknown.Core.Analyzers.Japanese; public abstract class JapaneseContextAnalyser @@ -165,7 +167,7 @@ public float GetConfidence() return DONT_KNOW; } - public void HandleData(byte[] buf, int offset, int len) + public void HandleData(ReadOnlySpan buf, int offset, int len) { int max = offset + len; @@ -199,7 +201,7 @@ public void HandleData(byte[] buf, int offset, int len) } } - public void HandleOneChar(byte[] buf, int offset, int charLen) + public void HandleOneChar(ReadOnlySpan buf, int offset, int charLen) { if (totalRel > MAX_REL_THRESHOLD) done = true; @@ -227,9 +229,9 @@ public void Reset() } } - protected abstract int GetOrder(byte[] buf, int offset, out int charLen); + protected abstract int GetOrder(ReadOnlySpan buf, int offset, out int charLen); - protected abstract int GetOrder(byte[] buf, int offset); + protected abstract int GetOrder(ReadOnlySpan buf, int offset); public bool GotEnoughData() { diff --git a/src/Core/Analyzers/MultiByte/Japanese/SJISContextAnalyser.cs b/src/Core/Analyzers/MultiByte/Japanese/SJISContextAnalyser.cs index df96134..ecc6c18 100644 --- a/src/Core/Analyzers/MultiByte/Japanese/SJISContextAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Japanese/SJISContextAnalyser.cs @@ -1,10 +1,12 @@ -namespace UtfUnknown.Core.Analyzers.Japanese; +using System; + +namespace UtfUnknown.Core.Analyzers.Japanese; public class SJISContextAnalyser : JapaneseContextAnalyser { private const byte HIRAGANA_FIRST_BYTE = 0x82; - protected override int GetOrder(byte[] buf, int offset, out int charLen) + protected override int GetOrder(ReadOnlySpan buf, int offset, out int charLen) { //find out current char's byte length if (buf[offset] >= 0x81 && buf[offset] <= 0x9F @@ -22,7 +24,7 @@ protected override int GetOrder(byte[] buf, int offset, out int charLen) return -1; } - protected override int GetOrder(byte[] buf, int offset) + protected override int GetOrder(ReadOnlySpan buf, int offset) { // We are only interested in Hiragana if (buf[offset] == HIRAGANA_FIRST_BYTE) { @@ -32,4 +34,4 @@ protected override int GetOrder(byte[] buf, int offset) } return -1; } -} \ No newline at end of file +} diff --git a/src/Core/Analyzers/MultiByte/Japanese/SJISDistributionAnalyser.cs b/src/Core/Analyzers/MultiByte/Japanese/SJISDistributionAnalyser.cs index 83f4068..2d3d522 100644 --- a/src/Core/Analyzers/MultiByte/Japanese/SJISDistributionAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Japanese/SJISDistributionAnalyser.cs @@ -1,3 +1,5 @@ +using System; + namespace UtfUnknown.Core.Analyzers.Japanese; public class SJISDistributionAnalyser : CharDistributionAnalyser @@ -558,7 +560,7 @@ public SJISDistributionAnalyser() /// second byte range: 0x40 -- 0x7e, 0x81 -- oxfe /// no validation needed here. State machine has done that /// - public override int GetOrder(byte[] buf, int offset) + public override int GetOrder(ReadOnlySpan buf, int offset) { int order; diff --git a/src/Core/Analyzers/MultiByte/Korean/EUCKRDistributionAnalyser.cs b/src/Core/Analyzers/MultiByte/Korean/EUCKRDistributionAnalyser.cs index 736c8b4..477ffc7 100644 --- a/src/Core/Analyzers/MultiByte/Korean/EUCKRDistributionAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Korean/EUCKRDistributionAnalyser.cs @@ -1,4 +1,6 @@ -namespace UtfUnknown.Core.Analyzers.Korean; +using System; + +namespace UtfUnknown.Core.Analyzers.Korean; public class EUCKRDistributionAnalyser : CharDistributionAnalyser { @@ -583,7 +585,7 @@ public EUCKRDistributionAnalyser() /// second byte range: 0xa1 -- 0xfe /// no validation needed here. State machine has done that /// - public override int GetOrder(byte[] buf, int offset) + public override int GetOrder(ReadOnlySpan buf, int offset) { if (buf[offset] >= 0xB0) return 94 * (buf[offset] - 0xB0) + buf[offset+1] - 0xA1; diff --git a/src/Core/Probers/CharsetProber.cs b/src/Core/Probers/CharsetProber.cs index f30d2c8..2b7f28d 100644 --- a/src/Core/Probers/CharsetProber.cs +++ b/src/Core/Probers/CharsetProber.cs @@ -36,8 +36,10 @@ * * ***** END LICENSE BLOCK ***** */ +using System; using System.IO; using System.Text; +using System.Buffers; namespace UtfUnknown.Core.Probers; @@ -65,7 +67,7 @@ public abstract class CharsetProber /// /// A /// - public abstract ProbingState HandleData(byte[] buf, int offset, int len); + public abstract ProbingState HandleData(ReadOnlySpan buf); /// /// Reset prober state @@ -98,16 +100,16 @@ public virtual string DumpStatus() /// /// /// filtered buffer - protected static byte[] FilterWithoutEnglishLetters(byte[] buf, int offset, int len) + protected static byte[] FilterWithoutEnglishLetters(ReadOnlySpan buf) { byte[] result; using (MemoryStream ms = new MemoryStream(buf.Length)) { bool meetMSB = false; - int max = offset + len; - int prev = offset; - int cur = offset; + int max = buf.Length; + int prev = 0; + int cur = 0; while (cur < max) { @@ -121,7 +123,7 @@ protected static byte[] FilterWithoutEnglishLetters(byte[] buf, int offset, int { if (meetMSB && cur > prev) { - ms.Write(buf, prev, cur - prev); + WriteSpanToStream(ms, buf.Slice(prev, cur - prev)); ms.WriteByte(SPACE); meetMSB = false; } @@ -131,7 +133,7 @@ protected static byte[] FilterWithoutEnglishLetters(byte[] buf, int offset, int } if (meetMSB && cur > prev) - ms.Write(buf, prev, cur - prev); + WriteSpanToStream(ms, buf.Slice(prev, cur - prev)); ms.SetLength(ms.Position); result = ms.ToArray(); } @@ -144,21 +146,19 @@ protected static byte[] FilterWithoutEnglishLetters(byte[] buf, int offset, int /// both English characters and upper ASCII characters. /// /// a filtered copy of the input buffer - protected static byte[] FilterWithEnglishLetters(byte[] buf, int offset, int len) + protected static byte[] FilterWithEnglishLetters(ReadOnlySpan buf) { byte[] result; using (MemoryStream ms = new MemoryStream(buf.Length)) { - bool inTag = false; - int max = offset + len; - int prev = offset; - int cur = offset; + int max = buf.Length; + int prev = 0; + int cur = 0; while (cur < max) { - byte b = buf[cur]; if (b == GREATER_THAN) @@ -172,7 +172,7 @@ protected static byte[] FilterWithEnglishLetters(byte[] buf, int offset, int len { if (cur > prev && !inTag) { - ms.Write(buf, prev, cur - prev); + WriteSpanToStream(ms, buf.Slice(prev, cur - prev)); ms.WriteByte(SPACE); } prev = cur + 1; @@ -183,10 +183,30 @@ protected static byte[] FilterWithEnglishLetters(byte[] buf, int offset, int len // If the current segment contains more than just a symbol // and it is not inside a tag then keep it. if (!inTag && cur > prev) - ms.Write(buf, prev, cur - prev); + WriteSpanToStream(ms, buf.Slice(prev, cur - prev)); ms.SetLength(ms.Position); result = ms.ToArray(); } return result; } -} \ No newline at end of file + + private static void WriteSpanToStream(MemoryStream stream, ReadOnlySpan buffer) + { +#if NETSTANDARD2_1_OR_GREATER || NETCOREAPP2_1_OR_GREATER + stream.Write(buffer); +#else + byte[] rent = ArrayPool.Shared.Rent(buffer.Length); + + try + { + buffer.CopyTo(rent); + + stream.Write(rent, 0, buffer.Length); + } + finally + { + ArrayPool.Shared.Return(rent); + } +#endif + } +} diff --git a/src/Core/Probers/EscCharsetProber.cs b/src/Core/Probers/EscCharsetProber.cs index 51796b1..2c5dda6 100644 --- a/src/Core/Probers/EscCharsetProber.cs +++ b/src/Core/Probers/EscCharsetProber.cs @@ -35,6 +35,7 @@ * * ***** END LICENSE BLOCK ***** */ +using System; using System.Text; using UtfUnknown.Core.Models; @@ -70,11 +71,11 @@ public override void Reset() detectedCharset = null; } - public override ProbingState HandleData(byte[] buf, int offset, int len) + public override ProbingState HandleData(ReadOnlySpan buf) { - int max = offset + len; + int max = buf.Length; - for (int i = offset; i < max && state == ProbingState.Detecting; i++) { + for (int i = 0; i < max && state == ProbingState.Detecting; i++) { for (int j = activeSM - 1; j >= 0; j--) { // byte is feed to all active state machine int codingState = codingSM[j].NextState(buf[i]); diff --git a/src/Core/Probers/HebrewProber.cs b/src/Core/Probers/HebrewProber.cs index d6c4da1..646f345 100644 --- a/src/Core/Probers/HebrewProber.cs +++ b/src/Core/Probers/HebrewProber.cs @@ -36,6 +36,7 @@ * * ***** END LICENSE BLOCK ***** */ +using System; using System.Text; /* @@ -209,15 +210,15 @@ public void SetModelProbers(CharsetProber logical, CharsetProber visual) * The input buffer should not contain any white spaces that are not (' ') * or any low-ascii punctuation marks. */ - public override ProbingState HandleData(byte[] buf, int offset, int len) + public override ProbingState HandleData(ReadOnlySpan buf) { // Both model probers say it's not them. No reason to continue. if (GetState() == ProbingState.NotMe) return ProbingState.NotMe; - int max = offset + len; + int max = buf.Length; - for (int i = offset; i < max; i++) { + for (int i = 0; i < max; i++) { byte b = buf[i]; diff --git a/src/Core/Probers/Latin1Prober.cs b/src/Core/Probers/Latin1Prober.cs index 267a2c3..180327e 100644 --- a/src/Core/Probers/Latin1Prober.cs +++ b/src/Core/Probers/Latin1Prober.cs @@ -36,6 +36,7 @@ * * ***** END LICENSE BLOCK ***** */ +using System; using System.Text; namespace UtfUnknown.Core.Probers; @@ -130,9 +131,9 @@ public override void Reset() freqCounter[i] = 0; } - public override ProbingState HandleData(byte[] buf, int offset, int len) + public override ProbingState HandleData(ReadOnlySpan buf) { - byte[] newbuf = FilterWithEnglishLetters(buf, offset, len); + byte[] newbuf = FilterWithEnglishLetters(buf); byte charClass, freq; for (int i = 0; i < newbuf.Length; i++) diff --git a/src/Core/Probers/MBCSGroupProber.cs b/src/Core/Probers/MBCSGroupProber.cs index 1e5c5f9..7668e71 100644 --- a/src/Core/Probers/MBCSGroupProber.cs +++ b/src/Core/Probers/MBCSGroupProber.cs @@ -36,6 +36,7 @@ * * ***** END LICENSE BLOCK ***** */ +using System; using System.Text; using UtfUnknown.Core.Probers.MultiByte; @@ -106,16 +107,16 @@ public override void Reset() state = ProbingState.Detecting; } - public override ProbingState HandleData(byte[] buf, int offset, int len) + public override ProbingState HandleData(ReadOnlySpan buf) { // do filtering to reduce load to probers - byte[] highbyteBuf = new byte[len]; + byte[] highbyteBuf = new byte[buf.Length]; int hptr = 0; //assume previous is not ascii, it will do no harm except add some noise bool keepNext = true; - int max = offset + len; + int max = buf.Length; - for (int i = offset; i < max; i++) + for (int i = 0; i < max; i++) { if ((buf[i] & 0x80) != 0) { @@ -137,7 +138,7 @@ public override ProbingState HandleData(byte[] buf, int offset, int len) { if (isActive[i]) { - var st = probers[i].HandleData(highbyteBuf, 0, hptr); + var st = probers[i].HandleData(highbyteBuf.AsSpan(0, hptr)); if (st == ProbingState.FoundIt) { bestGuess = i; diff --git a/src/Core/Probers/MultiByte/Chinese/Big5Prober.cs b/src/Core/Probers/MultiByte/Chinese/Big5Prober.cs index 22de251..879c99b 100644 --- a/src/Core/Probers/MultiByte/Chinese/Big5Prober.cs +++ b/src/Core/Probers/MultiByte/Chinese/Big5Prober.cs @@ -36,6 +36,7 @@ * * ***** END LICENSE BLOCK ***** */ +using System; using System.Text; using UtfUnknown.Core.Analyzers.Chinese; @@ -58,11 +59,11 @@ public Big5Prober() Reset(); } - public override ProbingState HandleData(byte[] buf, int offset, int len) + public override ProbingState HandleData(ReadOnlySpan buf) { - int max = offset + len; + int max = buf.Length; - for (int i = offset; i < max; i++) + for (int i = 0; i < max; i++) { var codingState = codingSM.NextState(buf[i]); if (codingState == StateMachineModel.ERROR) @@ -78,9 +79,9 @@ public override ProbingState HandleData(byte[] buf, int offset, int len) if (codingState == StateMachineModel.START) { int charLen = codingSM.CurrentCharLen; - if (i == offset) + if (i == 0) { - lastChar[1] = buf[offset]; + lastChar[1] = buf[0]; distributionAnalyser.HandleOneChar(lastChar, 0, charLen); } else diff --git a/src/Core/Probers/MultiByte/Chinese/EUCTWProber.cs b/src/Core/Probers/MultiByte/Chinese/EUCTWProber.cs index 9e10dea..cd5482e 100644 --- a/src/Core/Probers/MultiByte/Chinese/EUCTWProber.cs +++ b/src/Core/Probers/MultiByte/Chinese/EUCTWProber.cs @@ -36,6 +36,7 @@ * * ***** END LICENSE BLOCK ***** */ +using System; using System.Text; using UtfUnknown.Core.Analyzers.Chinese; @@ -57,10 +58,10 @@ public EUCTWProber() Reset(); } - public override ProbingState HandleData(byte[] buf, int offset, int len) + public override ProbingState HandleData(ReadOnlySpan buf) { int codingState; - int max = offset + len; + int max = buf.Length; for (int i = 0; i < max; i++) { @@ -80,9 +81,9 @@ public override ProbingState HandleData(byte[] buf, int offset, int len) if (codingState == StateMachineModel.START) { int charLen = codingSM.CurrentCharLen; - if (i == offset) + if (i == 0) { - lastChar[1] = buf[offset]; + lastChar[1] = buf[0]; distributionAnalyser.HandleOneChar(lastChar, 0, charLen); } else diff --git a/src/Core/Probers/MultiByte/Chinese/GB18030Prober.cs b/src/Core/Probers/MultiByte/Chinese/GB18030Prober.cs index 3a4c79d..73beebb 100644 --- a/src/Core/Probers/MultiByte/Chinese/GB18030Prober.cs +++ b/src/Core/Probers/MultiByte/Chinese/GB18030Prober.cs @@ -36,6 +36,7 @@ * * ***** END LICENSE BLOCK ***** */ +using System; using System.Text; using UtfUnknown.Core.Analyzers.Chinese; @@ -64,11 +65,11 @@ public override string GetCharsetName() return CodepageName.GB18030; } - public override ProbingState HandleData(byte[] buf, int offset, int len) + public override ProbingState HandleData(ReadOnlySpan buf) { - int max = offset + len; + int max = buf.Length; - for (int i = offset; i < max; i++) + for (int i = 0; i < max; i++) { var codingState = codingSM.NextState(buf[i]); @@ -87,9 +88,9 @@ public override ProbingState HandleData(byte[] buf, int offset, int len) if (codingState == StateMachineModel.START) { int charLen = codingSM.CurrentCharLen; - if (i == offset) + if (i == 0) { - lastChar[1] = buf[offset]; + lastChar[1] = buf[0]; analyser.HandleOneChar(lastChar, 0, charLen); } else diff --git a/src/Core/Probers/MultiByte/Japanese/EUCJPProber.cs b/src/Core/Probers/MultiByte/Japanese/EUCJPProber.cs index 7a21740..b983344 100644 --- a/src/Core/Probers/MultiByte/Japanese/EUCJPProber.cs +++ b/src/Core/Probers/MultiByte/Japanese/EUCJPProber.cs @@ -35,6 +35,7 @@ * * ***** END LICENSE BLOCK ***** */ +using System; using System.Text; using UtfUnknown.Core.Analyzers.Japanese; @@ -63,12 +64,12 @@ public override string GetCharsetName() return CodepageName.EUC_JP; } - public override ProbingState HandleData(byte[] buf, int offset, int len) + public override ProbingState HandleData(ReadOnlySpan buf) { int codingState; - int max = offset + len; + int max = buf.Length; - for (int i = offset; i < max; i++) + for (int i = 0; i < max; i++) { codingState = codingSM.NextState(buf[i]); if (codingState == StateMachineModel.ERROR) @@ -84,9 +85,9 @@ public override ProbingState HandleData(byte[] buf, int offset, int len) if (codingState == StateMachineModel.START) { int charLen = codingSM.CurrentCharLen; - if (i == offset) + if (i == 0) { - lastChar[1] = buf[offset]; + lastChar[1] = buf[0]; contextAnalyser.HandleOneChar(lastChar, 0, charLen); distributionAnalyser.HandleOneChar(lastChar, 0, charLen); } diff --git a/src/Core/Probers/MultiByte/Japanese/SJISProber.cs b/src/Core/Probers/MultiByte/Japanese/SJISProber.cs index a8ece9e..b8b96f0 100644 --- a/src/Core/Probers/MultiByte/Japanese/SJISProber.cs +++ b/src/Core/Probers/MultiByte/Japanese/SJISProber.cs @@ -36,6 +36,7 @@ * * ***** END LICENSE BLOCK ***** */ +using System; using System.Text; using UtfUnknown.Core.Analyzers.Japanese; @@ -70,12 +71,12 @@ public override string GetCharsetName() return CodepageName.SHIFT_JIS; } - public override ProbingState HandleData(byte[] buf, int offset, int len) + public override ProbingState HandleData(ReadOnlySpan buf) { int codingState; - int max = offset + len; + int max = buf.Length; - for (int i = offset; i < max; i++) + for (int i = 0; i < max; i++) { codingState = codingSM.NextState(buf[i]); if (codingState == StateMachineModel.ERROR) @@ -91,9 +92,9 @@ public override ProbingState HandleData(byte[] buf, int offset, int len) if (codingState == StateMachineModel.START) { int charLen = codingSM.CurrentCharLen; - if (i == offset) + if (i == 0) { - lastChar[1] = buf[offset]; + lastChar[1] = buf[0]; contextAnalyser.HandleOneChar(lastChar, 2 - charLen, charLen); distributionAnalyser.HandleOneChar(lastChar, 0, charLen); } diff --git a/src/Core/Probers/MultiByte/Korean/CP949Prober.cs b/src/Core/Probers/MultiByte/Korean/CP949Prober.cs index 307cdee..2837b4d 100644 --- a/src/Core/Probers/MultiByte/Korean/CP949Prober.cs +++ b/src/Core/Probers/MultiByte/Korean/CP949Prober.cs @@ -35,6 +35,7 @@ * * ***** END LICENSE BLOCK ***** */ +using System; using System.Text; using UtfUnknown.Core.Analyzers.Korean; @@ -63,12 +64,12 @@ public override string GetCharsetName() return CodepageName.CP949; } - public override ProbingState HandleData(byte[] buf, int offset, int len) + public override ProbingState HandleData(ReadOnlySpan buf) { int codingState; - int max = offset + len; + int max = buf.Length; - for (int i = offset; i < max; i++) + for (int i = 0; i < max; i++) { codingState = codingSM.NextState(buf[i]); if (codingState == StateMachineModel.ERROR) @@ -86,9 +87,9 @@ public override ProbingState HandleData(byte[] buf, int offset, int len) if (codingState == StateMachineModel.START) { int charLen = codingSM.CurrentCharLen; - if (i == offset) + if (i == 0) { - lastChar[1] = buf[offset]; + lastChar[1] = buf[0]; distributionAnalyser.HandleOneChar(lastChar, 0, charLen); } else diff --git a/src/Core/Probers/MultiByte/Korean/EUCKRProber.cs b/src/Core/Probers/MultiByte/Korean/EUCKRProber.cs index 87028f8..0b3931e 100644 --- a/src/Core/Probers/MultiByte/Korean/EUCKRProber.cs +++ b/src/Core/Probers/MultiByte/Korean/EUCKRProber.cs @@ -35,6 +35,7 @@ * * ***** END LICENSE BLOCK ***** */ +using System; using System.Text; using UtfUnknown.Core.Analyzers.Korean; @@ -61,12 +62,12 @@ public override string GetCharsetName() return CodepageName.EUC_KR; } - public override ProbingState HandleData(byte[] buf, int offset, int len) + public override ProbingState HandleData(ReadOnlySpan buf) { int codingState; - int max = offset + len; + int max = buf.Length; - for (int i = offset; i < max; i++) + for (int i = 0; i < max; i++) { codingState = codingSM.NextState(buf[i]); if (codingState == StateMachineModel.ERROR) @@ -84,9 +85,9 @@ public override ProbingState HandleData(byte[] buf, int offset, int len) if (codingState == StateMachineModel.START) { int charLen = codingSM.CurrentCharLen; - if (i == offset) + if (i == 0) { - lastChar[1] = buf[offset]; + lastChar[1] = buf[0]; distributionAnalyser.HandleOneChar(lastChar, 0, charLen); } else diff --git a/src/Core/Probers/MultiByte/UTF8Prober.cs b/src/Core/Probers/MultiByte/UTF8Prober.cs index 907bdc8..bd42b47 100644 --- a/src/Core/Probers/MultiByte/UTF8Prober.cs +++ b/src/Core/Probers/MultiByte/UTF8Prober.cs @@ -36,6 +36,7 @@ * * ***** END LICENSE BLOCK ***** */ +using System; using System.Text; using UtfUnknown.Core.Models; @@ -68,11 +69,11 @@ public override void Reset() state = ProbingState.Detecting; } - public override ProbingState HandleData(byte[] buf, int offset, int len) + public override ProbingState HandleData(ReadOnlySpan buf) { - int max = offset + len; + int max = buf.Length; - for (int i = offset; i < max; i++) + for (int i = 0; i < max; i++) { var codingState = codingSM.NextState(buf[i]); diff --git a/src/Core/Probers/SBCSGroupProber.cs b/src/Core/Probers/SBCSGroupProber.cs index 0284860..83c3770 100644 --- a/src/Core/Probers/SBCSGroupProber.cs +++ b/src/Core/Probers/SBCSGroupProber.cs @@ -36,6 +36,7 @@ * * ***** END LICENSE BLOCK ***** */ +using System; using System.Text; #region using languages @@ -253,7 +254,7 @@ public SBCSGroupProber() Reset(); } - public override ProbingState HandleData(byte[] buf, int offset, int len) + public override ProbingState HandleData(ReadOnlySpan buf) { // apply filter to original buffer, and we got new buffer back // depend on what script it is, we will feed them the new buffer @@ -262,7 +263,7 @@ public override ProbingState HandleData(byte[] buf, int offset, int len) // of each prober since as of now, there are no probers here which // recognize languages with English characters. - byte[] newBuf = FilterWithoutEnglishLetters(buf, offset, len); + byte[] newBuf = FilterWithoutEnglishLetters(buf); if (newBuf.Length == 0) return state; // Nothing to see here, move on. @@ -271,7 +272,7 @@ public override ProbingState HandleData(byte[] buf, int offset, int len) { if (isActive[i]) { - ProbingState st = probers[i].HandleData(newBuf, 0, newBuf.Length); + ProbingState st = probers[i].HandleData(newBuf); if (st == ProbingState.FoundIt) { diff --git a/src/Core/Probers/SingleByteCharSetProber.cs b/src/Core/Probers/SingleByteCharSetProber.cs index 5b74d42..af9a08a 100644 --- a/src/Core/Probers/SingleByteCharSetProber.cs +++ b/src/Core/Probers/SingleByteCharSetProber.cs @@ -90,11 +90,11 @@ public SingleByteCharSetProber(SequenceModel model, bool reversed, Reset(); } - public override ProbingState HandleData(byte[] buf, int offset, int len) + public override ProbingState HandleData(ReadOnlySpan buf) { - int max = offset + len; + int max = buf.Length; - for (int i = offset; i < max; i++) + for (int i = 0; i < max; i++) { byte order = model.GetOrder(buf[i]); diff --git a/src/UTF-unknown.csproj b/src/UTF-unknown.csproj index 8b4c3ab..46b07a6 100644 --- a/src/UTF-unknown.csproj +++ b/src/UTF-unknown.csproj @@ -18,6 +18,7 @@ + From 0c9e607cd6893d5c17db28405d91e8c855c18e0b Mon Sep 17 00:00:00 2001 From: Harnel Date: Mon, 29 Jun 2026 17:11:24 +0900 Subject: [PATCH 2/5] Remove offset parameter from HandleOneChar and GetOrder --- src/Core/Analyzers/CharDistributionAnalyser.cs | 6 +++--- .../Chinese/BIG5DistributionAnalyser.cs | 10 +++++----- .../Chinese/EUCTWDistributionAnalyser.cs | 6 +++--- .../Chinese/GB18030DistributionAnalyser.cs | 6 +++--- .../MultiByte/Japanese/EUCJPContextAnalyser.cs | 12 ++++++------ .../Japanese/EUCJPDistributionAnalyser.cs | 6 +++--- .../Japanese/JapaneseContextAnalyser.cs | 10 +++++----- .../MultiByte/Japanese/SJISContextAnalyser.cs | 16 ++++++++-------- .../Japanese/SJISDistributionAnalyser.cs | 14 +++++++------- .../Korean/EUCKRDistributionAnalyser.cs | 6 +++--- src/Core/Probers/MultiByte/Chinese/Big5Prober.cs | 4 ++-- .../Probers/MultiByte/Chinese/EUCTWProber.cs | 4 ++-- .../Probers/MultiByte/Chinese/GB18030Prober.cs | 4 ++-- .../Probers/MultiByte/Japanese/EUCJPProber.cs | 8 ++++---- .../Probers/MultiByte/Japanese/SJISProber.cs | 10 +++++----- src/Core/Probers/MultiByte/Korean/CP949Prober.cs | 4 ++-- src/Core/Probers/MultiByte/Korean/EUCKRProber.cs | 4 ++-- 17 files changed, 65 insertions(+), 65 deletions(-) diff --git a/src/Core/Analyzers/CharDistributionAnalyser.cs b/src/Core/Analyzers/CharDistributionAnalyser.cs index d29158b..f54abac 100644 --- a/src/Core/Analyzers/CharDistributionAnalyser.cs +++ b/src/Core/Analyzers/CharDistributionAnalyser.cs @@ -81,7 +81,7 @@ public CharDistributionAnalyser() /// A /// /// - public abstract int GetOrder(ReadOnlySpan buf, int offset); + public abstract int GetOrder(ReadOnlySpan buf); /// /// Feed a character with known length @@ -89,10 +89,10 @@ public CharDistributionAnalyser() /// A /// buf offset /// 1 of 2 char length? - public void HandleOneChar(ReadOnlySpan buf, int offset, int charLen) + public void HandleOneChar(ReadOnlySpan buf, int charLen) { //we only care about 2-bytes character in our distribution analysis - int order = (charLen == 2) ? GetOrder(buf, offset) : -1; + int order = (charLen == 2) ? GetOrder(buf) : -1; if (order >= 0) { totalChars++; diff --git a/src/Core/Analyzers/MultiByte/Chinese/BIG5DistributionAnalyser.cs b/src/Core/Analyzers/MultiByte/Chinese/BIG5DistributionAnalyser.cs index b2fdbc3..62733a5 100644 --- a/src/Core/Analyzers/MultiByte/Chinese/BIG5DistributionAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Chinese/BIG5DistributionAnalyser.cs @@ -916,13 +916,13 @@ public BIG5DistributionAnalyser() /// second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe /// no validation needed here. State machine has done that /// - public override int GetOrder(ReadOnlySpan buf, int offset) + public override int GetOrder(ReadOnlySpan buf) { - if (buf[offset] >= 0xA4) { - if (buf[offset+1] >= 0xA1) - return 157 * (buf[offset] - 0xA4) + buf[offset+1] - 0xA1 + 63; + if (buf[0] >= 0xA4) { + if (buf[1] >= 0xA1) + return 157 * (buf[0] - 0xA4) + buf[1] - 0xA1 + 63; else - return 157 * (buf[offset] - 0xA4) + buf[offset+1] - 0x40; + return 157 * (buf[0] - 0xA4) + buf[1] - 0x40; } else { return -1; } diff --git a/src/Core/Analyzers/MultiByte/Chinese/EUCTWDistributionAnalyser.cs b/src/Core/Analyzers/MultiByte/Chinese/EUCTWDistributionAnalyser.cs index 77fb052..1648bf7 100644 --- a/src/Core/Analyzers/MultiByte/Chinese/EUCTWDistributionAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Chinese/EUCTWDistributionAnalyser.cs @@ -419,10 +419,10 @@ public EUCTWDistributionAnalyser() /// second byte range: 0xa1 -- 0xfe /// no validation needed here. State machine has done that /// - public override int GetOrder(ReadOnlySpan buf, int offset) + public override int GetOrder(ReadOnlySpan buf) { - if (buf[offset] >= 0xC4) - return 94 * (buf[offset] - 0xC4) + buf[offset+1] - 0xA1; + if (buf[0] >= 0xC4) + return 94 * (buf[0] - 0xC4) + buf[1] - 0xA1; else return -1; } diff --git a/src/Core/Analyzers/MultiByte/Chinese/GB18030DistributionAnalyser.cs b/src/Core/Analyzers/MultiByte/Chinese/GB18030DistributionAnalyser.cs index 501ebd6..9ffc1fe 100644 --- a/src/Core/Analyzers/MultiByte/Chinese/GB18030DistributionAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Chinese/GB18030DistributionAnalyser.cs @@ -465,10 +465,10 @@ public GB18030DistributionAnalyser() : base() /// no validation needed here. State machine has done that /// /// - public override int GetOrder(ReadOnlySpan buf, int offset) + public override int GetOrder(ReadOnlySpan buf) { - if (buf[offset] >= 0xB0 && buf[offset+1] >= 0xA1) - return 94 * (buf[offset] - 0xb0) + buf[offset+1] - 0xA1; + if (buf[0] >= 0xB0 && buf[1] >= 0xA1) + return 94 * (buf[0] - 0xb0) + buf[1] - 0xA1; else return -1; } diff --git a/src/Core/Analyzers/MultiByte/Japanese/EUCJPContextAnalyser.cs b/src/Core/Analyzers/MultiByte/Japanese/EUCJPContextAnalyser.cs index 8c934a8..fbd636e 100644 --- a/src/Core/Analyzers/MultiByte/Japanese/EUCJPContextAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Japanese/EUCJPContextAnalyser.cs @@ -6,9 +6,9 @@ public class EUCJPContextAnalyser : JapaneseContextAnalyser { private const byte HIRAGANA_FIRST_BYTE = 0xA4; - protected override int GetOrder(ReadOnlySpan buf, int offset, out int charLen) + protected override int GetOrder(ReadOnlySpan buf, out int charLen) { - byte high = buf[offset]; + byte high = buf[0]; //find out current char's byte length if (high == 0x8E || high >= 0xA1 && high <= 0xFE) @@ -20,18 +20,18 @@ protected override int GetOrder(ReadOnlySpan buf, int offset, out int char // return its order if it is hiragana if (high == HIRAGANA_FIRST_BYTE) { - byte low = buf[offset+1]; + byte low = buf[1]; if (low >= 0xA1 && low <= 0xF3) return low - 0xA1; } return -1; } - protected override int GetOrder(ReadOnlySpan buf, int offset) + protected override int GetOrder(ReadOnlySpan buf) { // We are only interested in Hiragana - if (buf[offset] == HIRAGANA_FIRST_BYTE) { - byte low = buf[offset+1]; + if (buf[0] == HIRAGANA_FIRST_BYTE) { + byte low = buf[1]; if (low >= 0xA1 && low <= 0xF3) return low - 0xA1; } diff --git a/src/Core/Analyzers/MultiByte/Japanese/EUCJPDistributionAnalyser.cs b/src/Core/Analyzers/MultiByte/Japanese/EUCJPDistributionAnalyser.cs index 4402ec1..8700f2c 100644 --- a/src/Core/Analyzers/MultiByte/Japanese/EUCJPDistributionAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Japanese/EUCJPDistributionAnalyser.cs @@ -9,10 +9,10 @@ public class EUCJPDistributionAnalyser : SJISDistributionAnalyser /// second byte range: 0xa1 -- 0xfe /// no validation needed here. State machine has done that /// - public override int GetOrder(ReadOnlySpan buf, int offset) + public override int GetOrder(ReadOnlySpan buf) { - if (buf[offset] >= 0xA0) - return 94 * (buf[offset] - 0xA1) + buf[offset+1] - 0xA1; + if (buf[0] >= 0xA0) + return 94 * (buf[0] - 0xA1) + buf[1] - 0xA1; else return -1; } diff --git a/src/Core/Analyzers/MultiByte/Japanese/JapaneseContextAnalyser.cs b/src/Core/Analyzers/MultiByte/Japanese/JapaneseContextAnalyser.cs index 4691ce0..91348eb 100644 --- a/src/Core/Analyzers/MultiByte/Japanese/JapaneseContextAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Japanese/JapaneseContextAnalyser.cs @@ -182,7 +182,7 @@ public void HandleData(ReadOnlySpan buf, int offset, int len) // is complete, but since a character will not make much difference, // skipping it will simplify our logic and improve performance. for (int i = needToSkipCharNum+offset; i < max; ) { - int order = GetOrder(buf, i, out var charLen); + int order = GetOrder(buf.Slice(i), out var charLen); i += charLen; if (i > max) { needToSkipCharNum = i - max; @@ -201,7 +201,7 @@ public void HandleData(ReadOnlySpan buf, int offset, int len) } } - public void HandleOneChar(ReadOnlySpan buf, int offset, int charLen) + public void HandleOneChar(ReadOnlySpan buf, int charLen) { if (totalRel > MAX_REL_THRESHOLD) done = true; @@ -209,7 +209,7 @@ public void HandleOneChar(ReadOnlySpan buf, int offset, int charLen) return; // Only 2-bytes characters are of our interest - int order = (charLen == 2) ? GetOrder(buf, offset) : -1; + int order = (charLen == 2) ? GetOrder(buf) : -1; if (order != -1 && lastCharOrder != -1) { totalRel++; // count this sequence to its category counter @@ -229,9 +229,9 @@ public void Reset() } } - protected abstract int GetOrder(ReadOnlySpan buf, int offset, out int charLen); + protected abstract int GetOrder(ReadOnlySpan buf, out int charLen); - protected abstract int GetOrder(ReadOnlySpan buf, int offset); + protected abstract int GetOrder(ReadOnlySpan buf); public bool GotEnoughData() { diff --git a/src/Core/Analyzers/MultiByte/Japanese/SJISContextAnalyser.cs b/src/Core/Analyzers/MultiByte/Japanese/SJISContextAnalyser.cs index ecc6c18..b559d31 100644 --- a/src/Core/Analyzers/MultiByte/Japanese/SJISContextAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Japanese/SJISContextAnalyser.cs @@ -6,29 +6,29 @@ public class SJISContextAnalyser : JapaneseContextAnalyser { private const byte HIRAGANA_FIRST_BYTE = 0x82; - protected override int GetOrder(ReadOnlySpan buf, int offset, out int charLen) + protected override int GetOrder(ReadOnlySpan buf, out int charLen) { //find out current char's byte length - if (buf[offset] >= 0x81 && buf[offset] <= 0x9F - || buf[offset] >= 0xe0 && buf[offset] <= 0xFC) + if (buf[0] >= 0x81 && buf[0] <= 0x9F + || buf[0] >= 0xe0 && buf[0] <= 0xFC) charLen = 2; else charLen = 1; // return its order if it is hiragana - if (buf[offset] == HIRAGANA_FIRST_BYTE) { - byte low = buf[offset+1]; + if (buf[0] == HIRAGANA_FIRST_BYTE) { + byte low = buf[1]; if (low >= 0x9F && low <= 0xF1) return low - 0x9F; } return -1; } - protected override int GetOrder(ReadOnlySpan buf, int offset) + protected override int GetOrder(ReadOnlySpan buf) { // We are only interested in Hiragana - if (buf[offset] == HIRAGANA_FIRST_BYTE) { - byte low = buf[offset+1]; + if (buf[0] == HIRAGANA_FIRST_BYTE) { + byte low = buf[1]; if (low >= 0x9F && low <= 0xF1) return low - 0x9F; } diff --git a/src/Core/Analyzers/MultiByte/Japanese/SJISDistributionAnalyser.cs b/src/Core/Analyzers/MultiByte/Japanese/SJISDistributionAnalyser.cs index 2d3d522..b1f2f41 100644 --- a/src/Core/Analyzers/MultiByte/Japanese/SJISDistributionAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Japanese/SJISDistributionAnalyser.cs @@ -560,19 +560,19 @@ public SJISDistributionAnalyser() /// second byte range: 0x40 -- 0x7e, 0x81 -- oxfe /// no validation needed here. State machine has done that /// - public override int GetOrder(ReadOnlySpan buf, int offset) + public override int GetOrder(ReadOnlySpan buf) { int order; - if (buf[offset] >= 0x81 && buf[offset] <= 0x9F) - order = 188 * (buf[offset] - 0x81); - else if (buf[offset] >= 0xE0 && buf[offset] <= 0xEF) - order = 188 * (buf[offset] - 0xE0 + 31); + if (buf[0] >= 0x81 && buf[0] <= 0x9F) + order = 188 * (buf[0] - 0x81); + else if (buf[0] >= 0xE0 && buf[0] <= 0xEF) + order = 188 * (buf[0] - 0xE0 + 31); else return -1; - order += buf[offset+1] - 0x40; + order += buf[1] - 0x40; - if (buf[offset+1] > 0x7F) + if (buf[1] > 0x7F) order--; return order; } diff --git a/src/Core/Analyzers/MultiByte/Korean/EUCKRDistributionAnalyser.cs b/src/Core/Analyzers/MultiByte/Korean/EUCKRDistributionAnalyser.cs index 477ffc7..2146be5 100644 --- a/src/Core/Analyzers/MultiByte/Korean/EUCKRDistributionAnalyser.cs +++ b/src/Core/Analyzers/MultiByte/Korean/EUCKRDistributionAnalyser.cs @@ -585,10 +585,10 @@ public EUCKRDistributionAnalyser() /// second byte range: 0xa1 -- 0xfe /// no validation needed here. State machine has done that /// - public override int GetOrder(ReadOnlySpan buf, int offset) + public override int GetOrder(ReadOnlySpan buf) { - if (buf[offset] >= 0xB0) - return 94 * (buf[offset] - 0xB0) + buf[offset+1] - 0xA1; + if (buf[0] >= 0xB0) + return 94 * (buf[0] - 0xB0) + buf[1] - 0xA1; else return -1; } diff --git a/src/Core/Probers/MultiByte/Chinese/Big5Prober.cs b/src/Core/Probers/MultiByte/Chinese/Big5Prober.cs index 879c99b..b4dcad9 100644 --- a/src/Core/Probers/MultiByte/Chinese/Big5Prober.cs +++ b/src/Core/Probers/MultiByte/Chinese/Big5Prober.cs @@ -82,11 +82,11 @@ public override ProbingState HandleData(ReadOnlySpan buf) if (i == 0) { lastChar[1] = buf[0]; - distributionAnalyser.HandleOneChar(lastChar, 0, charLen); + distributionAnalyser.HandleOneChar(lastChar, charLen); } else { - distributionAnalyser.HandleOneChar(buf, i - 1, charLen); + distributionAnalyser.HandleOneChar(buf.Slice(i - 1), charLen); } } } diff --git a/src/Core/Probers/MultiByte/Chinese/EUCTWProber.cs b/src/Core/Probers/MultiByte/Chinese/EUCTWProber.cs index cd5482e..df17787 100644 --- a/src/Core/Probers/MultiByte/Chinese/EUCTWProber.cs +++ b/src/Core/Probers/MultiByte/Chinese/EUCTWProber.cs @@ -84,11 +84,11 @@ public override ProbingState HandleData(ReadOnlySpan buf) if (i == 0) { lastChar[1] = buf[0]; - distributionAnalyser.HandleOneChar(lastChar, 0, charLen); + distributionAnalyser.HandleOneChar(lastChar, charLen); } else { - distributionAnalyser.HandleOneChar(buf, i - 1, charLen); + distributionAnalyser.HandleOneChar(buf.Slice(i - 1), charLen); } } } diff --git a/src/Core/Probers/MultiByte/Chinese/GB18030Prober.cs b/src/Core/Probers/MultiByte/Chinese/GB18030Prober.cs index 73beebb..c76ae08 100644 --- a/src/Core/Probers/MultiByte/Chinese/GB18030Prober.cs +++ b/src/Core/Probers/MultiByte/Chinese/GB18030Prober.cs @@ -91,11 +91,11 @@ public override ProbingState HandleData(ReadOnlySpan buf) if (i == 0) { lastChar[1] = buf[0]; - analyser.HandleOneChar(lastChar, 0, charLen); + analyser.HandleOneChar(lastChar, charLen); } else { - analyser.HandleOneChar(buf, i - 1, charLen); + analyser.HandleOneChar(buf.Slice(i - 1), charLen); } } } diff --git a/src/Core/Probers/MultiByte/Japanese/EUCJPProber.cs b/src/Core/Probers/MultiByte/Japanese/EUCJPProber.cs index b983344..2cfd10e 100644 --- a/src/Core/Probers/MultiByte/Japanese/EUCJPProber.cs +++ b/src/Core/Probers/MultiByte/Japanese/EUCJPProber.cs @@ -88,13 +88,13 @@ public override ProbingState HandleData(ReadOnlySpan buf) if (i == 0) { lastChar[1] = buf[0]; - contextAnalyser.HandleOneChar(lastChar, 0, charLen); - distributionAnalyser.HandleOneChar(lastChar, 0, charLen); + contextAnalyser.HandleOneChar(lastChar, charLen); + distributionAnalyser.HandleOneChar(lastChar, charLen); } else { - contextAnalyser.HandleOneChar(buf, i - 1, charLen); - distributionAnalyser.HandleOneChar(buf, i - 1, charLen); + contextAnalyser.HandleOneChar(buf.Slice(i - 1), charLen); + distributionAnalyser.HandleOneChar(buf.Slice(i - 1), charLen); } } } diff --git a/src/Core/Probers/MultiByte/Japanese/SJISProber.cs b/src/Core/Probers/MultiByte/Japanese/SJISProber.cs index b8b96f0..e1eeeaf 100644 --- a/src/Core/Probers/MultiByte/Japanese/SJISProber.cs +++ b/src/Core/Probers/MultiByte/Japanese/SJISProber.cs @@ -95,13 +95,13 @@ public override ProbingState HandleData(ReadOnlySpan buf) if (i == 0) { lastChar[1] = buf[0]; - contextAnalyser.HandleOneChar(lastChar, 2 - charLen, charLen); - distributionAnalyser.HandleOneChar(lastChar, 0, charLen); + contextAnalyser.HandleOneChar(lastChar.AsSpan(2 - charLen), charLen); + distributionAnalyser.HandleOneChar(lastChar, charLen); } else { - contextAnalyser.HandleOneChar(buf, i + 1 - charLen, charLen); - distributionAnalyser.HandleOneChar(buf, i - 1, charLen); + contextAnalyser.HandleOneChar(buf.Slice(i + 1 - charLen), charLen); + distributionAnalyser.HandleOneChar(buf.Slice(i - 1), charLen); } } } @@ -129,4 +129,4 @@ public override float GetConfidence(StringBuilder status = null) float distribCf = distributionAnalyser.GetConfidence(); return (contxtCf > distribCf ? contxtCf : distribCf); } -} \ No newline at end of file +} diff --git a/src/Core/Probers/MultiByte/Korean/CP949Prober.cs b/src/Core/Probers/MultiByte/Korean/CP949Prober.cs index 2837b4d..dae7c4e 100644 --- a/src/Core/Probers/MultiByte/Korean/CP949Prober.cs +++ b/src/Core/Probers/MultiByte/Korean/CP949Prober.cs @@ -90,11 +90,11 @@ public override ProbingState HandleData(ReadOnlySpan buf) if (i == 0) { lastChar[1] = buf[0]; - distributionAnalyser.HandleOneChar(lastChar, 0, charLen); + distributionAnalyser.HandleOneChar(lastChar, charLen); } else { - distributionAnalyser.HandleOneChar(buf, i - 1, charLen); + distributionAnalyser.HandleOneChar(buf.Slice(i - 1), charLen); } } } diff --git a/src/Core/Probers/MultiByte/Korean/EUCKRProber.cs b/src/Core/Probers/MultiByte/Korean/EUCKRProber.cs index 0b3931e..15941aa 100644 --- a/src/Core/Probers/MultiByte/Korean/EUCKRProber.cs +++ b/src/Core/Probers/MultiByte/Korean/EUCKRProber.cs @@ -88,11 +88,11 @@ public override ProbingState HandleData(ReadOnlySpan buf) if (i == 0) { lastChar[1] = buf[0]; - distributionAnalyser.HandleOneChar(lastChar, 0, charLen); + distributionAnalyser.HandleOneChar(lastChar, charLen); } else { - distributionAnalyser.HandleOneChar(buf, i - 1, charLen); + distributionAnalyser.HandleOneChar(buf.Slice(i - 1), charLen); } } } From b02f712d48fb4484b1875cd516f541253979a7ae Mon Sep 17 00:00:00 2001 From: Harnel Date: Mon, 29 Jun 2026 16:57:48 +0900 Subject: [PATCH 3/5] Add test for DetectFromBytes(ReadOnlySpan) --- tests/CharsetDetectorTest.cs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/CharsetDetectorTest.cs b/tests/CharsetDetectorTest.cs index 43006f1..07c6764 100644 --- a/tests/CharsetDetectorTest.cs +++ b/tests/CharsetDetectorTest.cs @@ -4,6 +4,7 @@ // Rudi Pettazzi // +using System; using System.IO; using System.Text; using System.Threading.Tasks; @@ -193,6 +194,16 @@ public void TestBomUtf8() Assert.That(result.Detected.HasBOM, Is.True); } + [Test] + public void DetectFromReadOnlySpan() + { + ReadOnlySpan buf = new byte[] { 0xEF, 0xBB, 0xBF, 0x68, 0x65, 0x6C, 0x6C, 0x6F, 0x21 }; + var result = CharsetDetector.DetectFromBytes(buf); + Assert.That(result.Detected.EncodingName, Is.EqualTo(CodepageName.UTF8)); + Assert.That(result.Detected.Confidence, Is.EqualTo(1.0f)); + Assert.That(result.Detected.HasBOM, Is.True); + } + [Test] public void Test2byteArrayBomUTF16_BE() { From b070eb6d67027fedaa5296bafe88993204d3dcc1 Mon Sep 17 00:00:00 2001 From: Harnel Date: Mon, 29 Jun 2026 17:22:31 +0900 Subject: [PATCH 4/5] Remove obsolete offset/len XML doc param tags --- src/Core/Analyzers/CharDistributionAnalyser.cs | 2 -- src/Core/Probers/CharsetProber.cs | 2 -- 2 files changed, 4 deletions(-) diff --git a/src/Core/Analyzers/CharDistributionAnalyser.cs b/src/Core/Analyzers/CharDistributionAnalyser.cs index f54abac..3ce5eae 100644 --- a/src/Core/Analyzers/CharDistributionAnalyser.cs +++ b/src/Core/Analyzers/CharDistributionAnalyser.cs @@ -79,7 +79,6 @@ public CharDistributionAnalyser() /// This allow multiple encoding of a language to share one frequency table /// /// A - /// /// public abstract int GetOrder(ReadOnlySpan buf); @@ -87,7 +86,6 @@ public CharDistributionAnalyser() /// Feed a character with known length /// /// A - /// buf offset /// 1 of 2 char length? public void HandleOneChar(ReadOnlySpan buf, int charLen) { diff --git a/src/Core/Probers/CharsetProber.cs b/src/Core/Probers/CharsetProber.cs index 2b7f28d..1807fc8 100644 --- a/src/Core/Probers/CharsetProber.cs +++ b/src/Core/Probers/CharsetProber.cs @@ -62,8 +62,6 @@ public abstract class CharsetProber /// Feed data to the prober /// /// a buffer - /// offset into buffer - /// number of bytes available into buffer /// /// A /// From 28d360afa5bcc24b4941d7bd69ef5df78b358b85 Mon Sep 17 00:00:00 2001 From: Harnel Date: Tue, 30 Jun 2026 14:26:14 +0900 Subject: [PATCH 5/5] Update System.Memory to 4.6.5 for .NET Standard 2.0 --- src/UTF-unknown.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/UTF-unknown.csproj b/src/UTF-unknown.csproj index 25f943f..07b3510 100644 --- a/src/UTF-unknown.csproj +++ b/src/UTF-unknown.csproj @@ -18,7 +18,7 @@ - +