A StreamReader that can (to a certain degree) detect the encoding of a text file or stream and can especially differentiate between ANSI and UTF8 without byte order mark.
How to Detect the Encoding of a Text File
Disclaimer: The tip described here is not a magic bullet that does it all. There are a lot of code pages and encodings out there that this approach cannot deal with. But if you are working in a Windows environment, it may be of help.
When dealing with text or csv files, I usually encounter files encoded in three different ways:
- ANSI or to be more precise: Windows-1252
- UTF8 with byte order mark (BOM)
- UTF8 without BOM
To read files, I used to use a StreamReader
like this:
/// <summary>
/// Reads the whole text file at <paramref name="path"/> and returns its content.
/// </summary>
/// <param name="path">Path of the file to read.</param>
/// <returns>The complete text of the file.</returns>
public string ReadFile(string path)
{
    string content;

    // A byte order mark, when present, selects the encoding;
    // otherwise the reader decodes with Encoding.Default.
    using (System.IO.StreamReader reader =
        new System.IO.StreamReader(path, Encoding.Default, true))
    {
        content = reader.ReadToEnd();
    }

    return content;
}
If a BOM is available, the StreamReader
will use it to get the correct encoding. In any other case, the Default encoding (here Windows-1252) will be used. This works fine for the first two cases, but it fails when UTF8 without BOM is used.
The problem is:
How to Differentiate Between ANSI and UTF8 Without BOM
One approach goes like this:
- Read a file with UTF8 encoding and catch the DecoderFallbackException.
- In case such an exception is thrown, read the file again using ANSI encoding, which is likely to be the right choice.
However, this means the file or stream has to be read again and doing things again is something that should not be done!
Finally, I came up with another solution.
The FlexiStreamReader
The original StreamReader
does a pretty good job at choosing the right encoding based on a BOM, so we only need to deal with the case that a BOM is missing and the StreamReader
uses the given encoding.
Hence we need a new encoding for the StreamReader
and this is the EncodingProvider
:
This class is derived from System.Text.Encoding and, because it is only used to get characters from the byte stream, we only need to implement the methods GetCharCount, GetChars and GetMaxCharCount.
Inside these methods the DecoderFallbackException
is handled and here the actual Encoding switches from UTF8 to Default.
So when the StreamReader
uses the EncodingProvider
to read from a stream, it starts with UTF8 and, as soon as the exception occurs, switches to Default (Windows-1252). The stream remains unaffected and the position inside the stream does not need to be changed; therefore this will work for a forward-only stream as well.
Because the EncodingProvider
is for a very special purpose and some methods are not even implemented, it should not be a public class. Instead, I chose to create the FlexiStreamReader
and make the EncodingProvider
a private class:
using System;
using System.Text;
using System.IO;
namespace MyClassLibrary
{
    /// <summary>
    /// A <see cref="StreamReader"/> that honours a byte order mark when one is
    /// present and otherwise starts decoding as strict UTF-8, switching to
    /// <see cref="Encoding.Default"/> the first time the bytes turn out not to
    /// be valid UTF-8.
    /// </summary>
    public class FlexiStreamReader : StreamReader
    {
        /// <summary>
        /// Creates a reader over <paramref name="stream"/>.
        /// </summary>
        /// <param name="stream">The stream to read from.</param>
        public FlexiStreamReader(Stream stream) :
            base(stream, new EncodingProvider(), detectEncodingFromByteOrderMarks: true)
        {
        }

        /// <summary>
        /// Creates a reader over the file at <paramref name="path"/>.
        /// </summary>
        /// <param name="path">Path of the file to read.</param>
        public FlexiStreamReader(string path) :
            base(path, new EncodingProvider(), detectEncodingFromByteOrderMarks: true)
        {
        }

        /// <summary>
        /// Gets the encoding currently used for decoding. While the private
        /// switching encoding is active, the encoding it currently delegates to
        /// is returned instead of the wrapper itself.
        /// </summary>
        public override Encoding CurrentEncoding
        {
            get
            {
                var enc = base.CurrentEncoding as EncodingProvider;
                if (enc != null)
                {
                    return enc.InternalEncoding;
                }

                // The base reader replaced the encoding (e.g. because of a BOM).
                return base.CurrentEncoding;
            }
        }

        /// <summary>
        /// Decode-only encoding that delegates to a strict UTF-8 encoding and
        /// switches once to <see cref="Encoding.Default"/> when decoding
        /// raises a <see cref="DecoderFallbackException"/>.
        /// </summary>
        /// <remarks>
        /// NOTE(review): every GetCharCount/GetChars call is decoded on its own,
        /// without carrying state between calls. A multi-byte UTF-8 sequence that
        /// is split across two calls would presumably be reported as invalid and
        /// trigger the switch - confirm against the reader's buffering.
        /// </remarks>
        private sealed class EncodingProvider : System.Text.Encoding
        {
            private Encoding m_internalEncoding;
            private bool m_useAnsi = false;

            public EncodingProvider()
            {
                // throwOnInvalidBytes makes invalid UTF-8 surface as a
                // DecoderFallbackException, which drives the switch below.
                m_internalEncoding = new UTF8Encoding
                    (encoderShouldEmitUTF8Identifier: true, throwOnInvalidBytes: true);
            }

            /// <summary>The encoding decoding is currently delegated to.</summary>
            public Encoding InternalEncoding
            {
                get { return m_internalEncoding; }
            }

            /// <summary>Not supported: this encoding is used for decoding only.</summary>
            public override int GetByteCount(char[] chars, int index, int count)
            {
                throw new NotSupportedException("This encoding only supports decoding.");
            }

            /// <summary>Not supported: this encoding is used for decoding only.</summary>
            public override int GetBytes(char[] chars,
                int charIndex, int charCount, byte[] bytes, int byteIndex)
            {
                throw new NotSupportedException("This encoding only supports decoding.");
            }

            /// <summary>
            /// Counts the characters the given bytes decode to, switching to the
            /// default encoding if they are not valid for the current one.
            /// </summary>
            public override int GetCharCount(byte[] bytes, int index, int count)
            {
                try
                {
                    return m_internalEncoding.GetCharCount(bytes, index, count);
                }
                catch (DecoderFallbackException)
                {
                    if (!TrySwitchToAnsi())
                    {
                        throw;
                    }

                    return m_internalEncoding.GetCharCount(bytes, index, count);
                }
            }

            /// <summary>
            /// Decodes the given bytes into <paramref name="chars"/>, switching to
            /// the default encoding if they are not valid for the current one.
            /// </summary>
            public override int GetChars(byte[] bytes, int byteIndex,
                int byteCount, char[] chars, int charIndex)
            {
                try
                {
                    return m_internalEncoding.GetChars
                        (bytes, byteIndex, byteCount, chars, charIndex);
                }
                catch (DecoderFallbackException)
                {
                    if (!TrySwitchToAnsi())
                    {
                        throw;
                    }

                    // Decode the same range again with the new encoding.
                    return m_internalEncoding.GetChars
                        (bytes, byteIndex, byteCount, chars, charIndex);
                }
            }

            /// <summary>
            /// Returns the current encoding's upper bound of bytes for
            /// <paramref name="charCount"/> characters.
            /// </summary>
            public override int GetMaxByteCount(int charCount)
            {
                // Pure size computation: no bytes are decoded here, so there is
                // no decoder fallback to handle.
                return m_internalEncoding.GetMaxByteCount(charCount);
            }

            /// <summary>
            /// Returns the current encoding's upper bound of characters for
            /// <paramref name="byteCount"/> bytes.
            /// </summary>
            public override int GetMaxCharCount(int byteCount)
            {
                // Pure size computation: no bytes are decoded here, so there is
                // no decoder fallback to handle.
                return m_internalEncoding.GetMaxCharCount(byteCount);
            }

            /// <summary>
            /// Switches decoding to <see cref="Encoding.Default"/> the first time
            /// it is called.
            /// </summary>
            /// <returns>
            /// <c>true</c> if the switch was made; <c>false</c> if it had already
            /// happened, in which case the caller should rethrow.
            /// </returns>
            private bool TrySwitchToAnsi()
            {
                if (m_useAnsi)
                {
                    return false;
                }

                m_useAnsi = true;
                m_internalEncoding = System.Text.Encoding.Default;
                return true;
            }
        }
    }
}
Using the Code
Copy the class FlexiStreamReader
to your project, adjust the namespace and use it:
/// <summary>
/// Reads the whole text file at <paramref name="path"/> through a
/// FlexiStreamReader and returns its content.
/// </summary>
/// <param name="path">Path of the file to read.</param>
/// <returns>The complete text of the file.</returns>
public string ReadFile(string path)
{
    string content;

    using (FlexiStreamReader reader = new FlexiStreamReader(path))
    {
        content = reader.ReadToEnd();
    }

    return content;
}
If you need to know the encoding, you can read it after reading the stream:
/// <summary>
/// Reads the whole text file at <paramref name="path"/> through a
/// FlexiStreamReader, writes the name of the encoding that was used to the
/// debug output, and returns the content.
/// </summary>
/// <param name="path">Path of the file to read.</param>
/// <returns>The complete text of the file.</returns>
public string ReadFile(string path)
{
    using (FlexiStreamReader reader = new FlexiStreamReader(path))
    {
        string content = reader.ReadToEnd();

        // Query the encoding only after the content has been read.
        string encodingName = reader.CurrentEncoding.EncodingName;
        Debug.WriteLine(encodingName);

        return content;
    }
}
Unit Tests
In case you have any doubts that this will work...
Here are some tests for the FlexiStreamReader
you can add to your test project:
using Microsoft.VisualStudio.TestTools.UnitTesting;
using MyClassLibrary;
using System;
using System.IO;
using System.Text;
namespace MyClassLibrary.Tests
{
    /// <summary>
    /// Round-trip tests: text is written to a memory stream with a given
    /// encoding and read back through a FlexiStreamReader.
    /// </summary>
    [TestClass()]
    public class FlexiStreamReaderTests
    {
        // Short sample containing characters outside the ASCII range.
        private const string ShortSample = "abcdäöü";

        /// <summary>UTF-8 written without a byte order mark.</summary>
        [TestMethod]
        public void ReadUTF8()
        {
            Encoding detected;
            string actual = ReadBack(new UTF8Encoding(false), ShortSample, out detected);

            Assert.AreEqual(ShortSample, actual);
            Assert.AreEqual(Encoding.UTF8.EncodingName, detected.EncodingName);
        }

        /// <summary>UTF-8 without a byte order mark, using a long text.</summary>
        [TestMethod]
        public void ReadUTF8_long()
        {
            string expected = GetTestString(10000);

            Encoding detected;
            string actual = ReadBack(new UTF8Encoding(false), expected, out detected);

            Assert.AreEqual(expected, actual);
            Assert.AreEqual(Encoding.UTF8.EncodingName, detected.EncodingName);
        }

        /// <summary>UTF-8 written with a byte order mark.</summary>
        [TestMethod]
        public void ReadUTF8_BOM()
        {
            Encoding detected;
            string actual = ReadBack(new UTF8Encoding(true), ShortSample, out detected);

            Assert.AreEqual(ShortSample, actual);
            Assert.AreEqual(Encoding.UTF8, detected);
        }

        /// <summary>Text written with the default encoding.</summary>
        [TestMethod]
        public void ReadAnsi()
        {
            Encoding detected;
            string actual = ReadBack(Encoding.Default, ShortSample, out detected);

            Assert.AreEqual(ShortSample, actual);
            Assert.AreEqual(Encoding.Default, detected);
        }

        /// <summary>Text written with Encoding.Unicode.</summary>
        [TestMethod]
        public void ReadUnicode()
        {
            Encoding detected;
            string actual = ReadBack(Encoding.Unicode, ShortSample, out detected);

            Assert.AreEqual(ShortSample, actual);
            Assert.AreEqual(Encoding.Unicode, detected);
        }

        /// <summary>Text written with Encoding.BigEndianUnicode.</summary>
        [TestMethod]
        public void ReadBigEndianUnicode()
        {
            Encoding detected;
            string actual = ReadBack(Encoding.BigEndianUnicode, ShortSample, out detected);

            Assert.AreEqual(ShortSample, actual);
            Assert.AreEqual(Encoding.BigEndianUnicode, detected);
        }

        /// <summary>
        /// Writes <paramref name="text"/> with <paramref name="writeEncoding"/>,
        /// reads it back through a FlexiStreamReader and reports the reader's
        /// CurrentEncoding as it is after the read.
        /// </summary>
        private static string ReadBack(Encoding writeEncoding, string text, out Encoding detected)
        {
            using (Stream source = GetStream(writeEncoding, text))
            using (FlexiStreamReader reader = new FlexiStreamReader(source))
            {
                string content = reader.ReadToEnd();

                // Captured while the reader is still open.
                detected = reader.CurrentEncoding;
                return content;
            }
        }

        /// <summary>
        /// Returns a memory stream, rewound to the start, holding
        /// <paramref name="text"/> written with <paramref name="enc"/>.
        /// </summary>
        private static Stream GetStream(Encoding enc, String text)
        {
            MemoryStream buffer = new MemoryStream();

            // The writer is flushed but not disposed, so the returned stream stays open.
            StreamWriter writer = new StreamWriter(buffer, enc);
            writer.Write(text);
            writer.Flush();

            buffer.Seek(0, SeekOrigin.Begin);
            return buffer;
        }

        /// <summary>
        /// Builds a string of <paramref name="length"/> characters: a run of 'a'
        /// followed by up to ten 'ö'.
        /// </summary>
        private static string GetTestString(int length)
        {
            int umlautCount = Math.Min(length / 10, 10);
            int asciiCount = length - umlautCount;

            return new string('a', asciiCount) + new string('ö', umlautCount);
        }
    }
}
History
- 19th April, 2023: Initial version