Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 45 additions & 35 deletions src/CharsetDetector.cs
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,20 @@ private CharsetDetector()
_lastChar = 0x00;
}

/// <summary>
/// Detects the character encoding of the supplied bytes.
/// A BOM is looked for at the very start of the span, so slice the span
/// before calling if only part of a buffer should be inspected.
/// </summary>
/// <param name="bytes">The bytes containing the text</param>
/// <returns>The result reported by the detector after all bytes have been fed to it.</returns>
public static DetectionResult DetectFromBytes(ReadOnlySpan<byte> bytes)
{
    var charsetDetector = new CharsetDetector();
    charsetDetector.Feed(bytes);

    var result = charsetDetector.DataEnd();
    return result;
}

/// <summary>
/// Detect the character encoding from this byte array.
/// It searches for BOM from bytes[0].
Expand All @@ -130,9 +144,7 @@ public static DetectionResult DetectFromBytes(byte[] bytes)
throw new ArgumentNullException(nameof(bytes));
}

var detector = new CharsetDetector();
detector.Feed(bytes, 0, bytes.Length);
return detector.DataEnd();
return DetectFromBytes(bytes.AsSpan());
}

/// <summary>
Expand Down Expand Up @@ -161,10 +173,8 @@ public static DetectionResult DetectFromBytes(byte[] bytes, int offset, int len)
{
throw new ArgumentException($"{nameof(len)} is greater than the number of bytes from {nameof(offset)} to the end of the array.");
}

var detector = new CharsetDetector();
detector.Feed(bytes, offset, len);
return detector.DataEnd();

return DetectFromBytes(bytes.AsSpan(offset, len));
}

/// <summary>
Expand Down Expand Up @@ -291,7 +301,7 @@ private static async Task ReadStreamAsync(Stream stream, long? maxBytes, Charset

private static bool FeedDetector(CharsetDetector detector, long? maxBytes, byte[] buff, int read, ref long readTotal, ref int toRead)
{
detector.Feed(buff, 0, read);
detector.Feed(buff.AsSpan(0, read));

if (maxBytes == null)
{
Expand Down Expand Up @@ -401,37 +411,37 @@ private static FileStream OpenFile(string filePath)
FileShare.ReadWrite);
}

protected virtual void Feed(byte[] buf, int offset, int len)
protected virtual void Feed(ReadOnlySpan<byte> buf)
{
if (_done)
{
return;
}

if (len > 0)
if (buf.Length > 0)
_gotData = true;
Comment on lines +421 to 422

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds like a good suggestion.

We need a unit test to validate this issue/fix.


// If the data starts with BOM, we know it is UTF
if (_start)
{
_start = false;
_done = IsStartsWithBom(buf, offset, len);
_done = IsStartsWithBom(buf);
if (_done)
return;
}

FindInputState(buf, offset, len);
FindInputState(buf);
foreach (var prober in CharsetProbers)
{
_done = RunProber(buf, offset, len, prober);
_done = RunProber(buf, prober);
if (_done)
return;
}
}

private bool IsStartsWithBom(byte[] buf, int offset, int len)
private bool IsStartsWithBom(ReadOnlySpan<byte> buf)
{
var bomSet = FindCharSetByBom(buf, offset, len);
var bomSet = FindCharSetByBom(buf);
if (bomSet != null)
{
_detectionDetail = new DetectionDetail(bomSet, 1.0f)
Expand All @@ -443,9 +453,9 @@ private bool IsStartsWithBom(byte[] buf, int offset, int len)
return false;
}

private bool RunProber(byte[] buf, int offset, int len, CharsetProber charsetProber)
private bool RunProber(ReadOnlySpan<byte> buf, CharsetProber charsetProber)
{
var probingState = charsetProber.HandleData(buf, offset, len);
var probingState = charsetProber.HandleData(buf);
if (probingState == ProbingState.FoundIt)
{
_detectionDetail = new DetectionDetail(charsetProber);
Expand All @@ -454,9 +464,9 @@ private bool RunProber(byte[] buf, int offset, int len, CharsetProber charsetPro
return false;
}

private void FindInputState(byte[] buf, int offset, int len)
private void FindInputState(ReadOnlySpan<byte> buf)
{
for (int i = offset; i < len; i++)
for (int i = 0; i < buf.Length; i++)
{
// other than 0xa0, if every other character is ascii, the page is ascii
if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0)
Expand Down Expand Up @@ -485,59 +495,59 @@ private void FindInputState(byte[] buf, int offset, int len)
}
}

private static string FindCharSetByBom(byte[] buf, int offset, int len)
private static string FindCharSetByBom(ReadOnlySpan<byte> buf)
{
if (len < 2)
if (buf.Length < 2)
return null;

var buf0 = buf[offset + 0];
var buf1 = buf[offset + 1];
var buf0 = buf[0];
var buf1 = buf[1];

if (buf0 == 0xFE && buf1 == 0xFF)
{
// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
return len > 3
&& buf[offset + 2] == 0x00 && buf[offset + 3] == 0x00
return buf.Length > 3
&& buf[2] == 0x00 && buf[3] == 0x00
? CodepageName.X_ISO_10646_UCS_4_3412
: CodepageName.UTF16_BE;
}

if (buf0 == 0xFF && buf1 == 0xFE)
{
return len > 3
&& buf[offset + 2] == 0x00 && buf[offset + 3] == 0x00
return buf.Length > 3
&& buf[2] == 0x00 && buf[3] == 0x00
? CodepageName.UTF32_LE
: CodepageName.UTF16_LE;
}

if (len < 3)
if (buf.Length < 3)
return null;

if (buf0 == 0xEF && buf1 == 0xBB && buf[offset + 2] == 0xBF)
if (buf0 == 0xEF && buf1 == 0xBB && buf[2] == 0xBF)
return CodepageName.UTF8;

if (len < 4)
if (buf.Length < 4)
return null;

// Checked here because all of the remaining BOM tests inspect at least 4 bytes.
if (buf0 == 0x00 && buf1 == 0x00)
{
if (buf[offset + 2] == 0xFE && buf[offset + 3] == 0xFF)
if (buf[2] == 0xFE && buf[3] == 0xFF)
return CodepageName.UTF32_BE;

// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
if (buf[offset + 2] == 0xFF && buf[offset + 3] == 0xFE)
if (buf[2] == 0xFF && buf[3] == 0xFE)
return CodepageName.X_ISO_10646_UCS_4_2143;
}

// Detect utf-7 with bom (see table in https://en.wikipedia.org/wiki/Byte_order_mark)
if (buf0 == 0x2B && buf1 == 0x2F && buf[offset + 2] == 0x76)
if (buf[offset + 3] == 0x38 || buf[offset + 3] == 0x39 || buf[offset + 3] == 0x2B || buf[offset + 3] == 0x2F)
if (buf0 == 0x2B && buf1 == 0x2F && buf[2] == 0x76)
if (buf[3] == 0x38 || buf[3] == 0x39 || buf[3] == 0x2B || buf[3] == 0x2F)
return CodepageName.UTF7;

// Detect GB18030 with bom (see table in https://en.wikipedia.org/wiki/Byte_order_mark)
// TODO: If you remove this check, GB18030Prober will still be defined as GB18030 -- is it a feature or a bug?
if (buf0 == 0x84 && buf1 == 0x31 && buf[offset + 2] == 0x95 && buf[offset + 3] == 0x33)
if (buf0 == 0x84 && buf1 == 0x31 && buf[2] == 0x95 && buf[3] == 0x33)
return CodepageName.GB18030;

return null;
Expand Down
12 changes: 6 additions & 6 deletions src/Core/Analyzers/CharDistributionAnalyser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
*
* ***** END LICENSE BLOCK ***** */

using System;

namespace UtfUnknown.Core.Analyzers;

/// <summary>
Expand Down Expand Up @@ -77,20 +79,18 @@ public CharDistributionAnalyser()
/// This allow multiple encoding of a language to share one frequency table
/// </remarks>
/// <param name="buf">A <see cref="System.Byte"/></param>
/// <param name="offset"></param>
/// <returns></returns>
public abstract int GetOrder(byte[] buf, int offset);
public abstract int GetOrder(ReadOnlySpan<byte> buf);

/// <summary>
/// Feed a character with known length
/// </summary>
/// <param name="buf">A <see cref="System.Byte"/></param>
/// <param name="offset">buf offset</param>
/// <param name="charLen">1 or 2 char length?</param>
public void HandleOneChar(byte[] buf, int offset, int charLen)
public void HandleOneChar(ReadOnlySpan<byte> buf, int charLen)
{
//we only care about 2-bytes character in our distribution analysis
int order = (charLen == 2) ? GetOrder(buf, offset) : -1;
int order = (charLen == 2) ? GetOrder(buf) : -1;
if (order >= 0)
{
totalChars++;
Expand Down Expand Up @@ -136,4 +136,4 @@ public bool GotEnoughData()
{
return totalChars > ENOUGH_DATA_THRESHOLD;
}
}
}
12 changes: 7 additions & 5 deletions src/Core/Analyzers/MultiByte/Chinese/BIG5DistributionAnalyser.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
using System;

namespace UtfUnknown.Core.Analyzers.Chinese;

public class BIG5DistributionAnalyser : CharDistributionAnalyser
Expand Down Expand Up @@ -914,13 +916,13 @@ public BIG5DistributionAnalyser()
/// second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
/// no validation needed here. State machine has done that
/// </summary>
public override int GetOrder(byte[] buf, int offset)
public override int GetOrder(ReadOnlySpan<byte> buf)
{
if (buf[offset] >= 0xA4) {
if (buf[offset+1] >= 0xA1)
return 157 * (buf[offset] - 0xA4) + buf[offset+1] - 0xA1 + 63;
if (buf[0] >= 0xA4) {
if (buf[1] >= 0xA1)
return 157 * (buf[0] - 0xA4) + buf[1] - 0xA1 + 63;
else
return 157 * (buf[offset] - 0xA4) + buf[offset+1] - 0x40;
return 157 * (buf[0] - 0xA4) + buf[1] - 0x40;
} else {
return -1;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
using System;

namespace UtfUnknown.Core.Analyzers.Chinese;

public class EUCTWDistributionAnalyser : CharDistributionAnalyser
Expand Down Expand Up @@ -417,11 +419,11 @@ public EUCTWDistributionAnalyser()
/// second byte range: 0xa1 -- 0xfe
/// no validation needed here. State machine has done that
/// </summary>
public override int GetOrder(byte[] buf, int offset)
public override int GetOrder(ReadOnlySpan<byte> buf)
{
if (buf[offset] >= 0xC4)
return 94 * (buf[offset] - 0xC4) + buf[offset+1] - 0xA1;
if (buf[0] >= 0xC4)
return 94 * (buf[0] - 0xC4) + buf[1] - 0xA1;
else
return -1;
}
}
}
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
using System;

namespace UtfUnknown.Core.Analyzers.Chinese;

public class GB18030DistributionAnalyser : CharDistributionAnalyser
Expand Down Expand Up @@ -463,10 +465,10 @@ public GB18030DistributionAnalyser() : base()
/// no validation needed here. State machine has done that
/// </summary>
/// <returns></returns>
public override int GetOrder(byte[] buf, int offset)
public override int GetOrder(ReadOnlySpan<byte> buf)
{
if (buf[offset] >= 0xB0 && buf[offset+1] >= 0xA1)
return 94 * (buf[offset] - 0xb0) + buf[offset+1] - 0xA1;
if (buf[0] >= 0xB0 && buf[1] >= 0xA1)
return 94 * (buf[0] - 0xb0) + buf[1] - 0xA1;
else
return -1;
}
Expand Down
16 changes: 9 additions & 7 deletions src/Core/Analyzers/MultiByte/Japanese/EUCJPContextAnalyser.cs
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
using System;

namespace UtfUnknown.Core.Analyzers.Japanese;

public class EUCJPContextAnalyser : JapaneseContextAnalyser
{
private const byte HIRAGANA_FIRST_BYTE = 0xA4;

protected override int GetOrder(byte[] buf, int offset, out int charLen)
protected override int GetOrder(ReadOnlySpan<byte> buf, out int charLen)
{
byte high = buf[offset];
byte high = buf[0];

//find out current char's byte length
if (high == 0x8E || high >= 0xA1 && high <= 0xFE)
Expand All @@ -18,21 +20,21 @@ protected override int GetOrder(byte[] buf, int offset, out int charLen)

// return its order if it is hiragana
if (high == HIRAGANA_FIRST_BYTE) {
byte low = buf[offset+1];
byte low = buf[1];
if (low >= 0xA1 && low <= 0xF3)
return low - 0xA1;
}
return -1;
}

protected override int GetOrder(byte[] buf, int offset)
protected override int GetOrder(ReadOnlySpan<byte> buf)
{
// We are only interested in Hiragana
if (buf[offset] == HIRAGANA_FIRST_BYTE) {
byte low = buf[offset+1];
if (buf[0] == HIRAGANA_FIRST_BYTE) {
byte low = buf[1];
if (low >= 0xA1 && low <= 0xF3)
return low - 0xA1;
}
return -1;
}
}
}
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
using System;

namespace UtfUnknown.Core.Analyzers.Japanese;

public class EUCJPDistributionAnalyser : SJISDistributionAnalyser
Expand All @@ -7,10 +9,10 @@ public class EUCJPDistributionAnalyser : SJISDistributionAnalyser
/// second byte range: 0xa1 -- 0xfe
/// no validation needed here. State machine has done that
/// </summary>
public override int GetOrder(byte[] buf, int offset)
public override int GetOrder(ReadOnlySpan<byte> buf)
{
if (buf[offset] >= 0xA0)
return 94 * (buf[offset] - 0xA1) + buf[offset+1] - 0xA1;
if (buf[0] >= 0xA0)
return 94 * (buf[0] - 0xA1) + buf[1] - 0xA1;
else
return -1;
}
Expand Down
Loading
Loading