I recently needed a way to replace accented characters with simple English ones to allow more readable, friendly URLs. I'm sure there are plenty of Danes out there who are sick of seeing their language butchered by UrlEncode. After a bit of reading up, it seems .NET 2.0 does 99% of the heavy lifting for you:
/// <summary>
/// Produces a simplified copy of a string: combining accent marks are removed
/// and any character present in <c>Lookup</c> is replaced by its mapped text.
/// </summary>
/// <param name="original">The text to simplify. May be null or empty.</param>
/// <returns>
/// The simplified text, re-composed to normalization form C, or
/// <see cref="string.Empty"/> when <paramref name="original"/> is null or empty.
/// </returns>
public static string ToSimpleCharacters(this string original)
{
    if (string.IsNullOrEmpty(original))
    {
        return string.Empty;
    }

    // Form D separates a base letter from its combining marks, so the marks
    // can be dropped individually in the loop below.
    string decomposed = original.Normalize(NormalizationForm.FormD);
    StringBuilder result = new StringBuilder(decomposed.Length);

    foreach (char current in decomposed)
    {
        if (CharUnicodeInfo.GetUnicodeCategory(current) == UnicodeCategory.NonSpacingMark)
        {
            // Combining accent: skip it, keeping only the base character.
            continue;
        }

        // Single dictionary probe instead of ContainsKey followed by the indexer.
        string replacement;
        if (Lookup.TryGetValue(current, out replacement))
        {
            result.Append(replacement);
        }
        else
        {
            result.Append(current);
        }
    }

    return result.ToString().Normalize(NormalizationForm.FormC);
}
// Substitution table used by ToSimpleCharacters for characters that are
// replaced by text rather than handled by stripping combining marks.
// Built once by the static field initializer, so callers never observe a
// partially populated dictionary (the previous lazy getter published the
// empty instance before filling it).
private static readonly Dictionary<char, string> _lookup = new Dictionary<char, string>
{
    { '\u00E6', "ae" }, // code point 230
    { '\u00C6', "Ae" }, // code point 198
    { '\u00F0', "d" }   // code point 240
};

/// <summary>
/// Gets the character-to-replacement-text map. Treat it as read-only.
/// </summary>
private static Dictionary<char, string> Lookup
{
    get { return _lookup; }
}
I’m sure that there must be a few substitutions that don’t get caught by this code. If you’ve got one, just drop me a line!