Introduction
While working on Identity Management projects, administrators often face the problem of re-formatting users' biographical data. When users' last names and titles are stored in a data source that does not preserve letter case (for example, all upper case or all lower case), Identity Management admins have to re-capitalize them. Last names and titles make simple first-letter capitalization difficult because of non-standard name spellings and the spelling of acronyms.
What problem does this solution solve
"Proper case" capitalization of user biographical data or any other string.
How does this help someone else?
System administrators and Identity Management professionals could use this Format Provider to process data that is stored in a case-insensitive data-source.
Using the code
To use this format provider, the user should include the Lost and Found Identity Proper Case Format provider into their project and use it in the following way:
string improper = " MRs. De'MArLeY-Smith mccarthy IV Sr. PhD ";
Without McOption
string result = string.Format(new LafiProperCaseFormatProvider(), "{0:p}", improper);
Result:
Mrs. De'Marley-Smith Mccarthy IV Sr. PhD
With McOption
string result = string.Format(new LafiProperCaseFormatProvider(), "{0:mc}", improper);
Result:
Mrs. De'Marley-Smith McCarthy IV Sr. PhD
How does the code actually work
The Lost and Found Identity Proper Case Format Provider is an implementation of the IFormatProvider and ICustomFormatter interfaces.
The format provider splits strings on the "space" character, then removes any excess white space and applies several patterns for special capitalization rules (Roman Numerals, Salutations, and titles like PhD, etc.); thereafter, the string is split on "hyphens" and "apostrophes" to ensure proper capitalization of hyphenated words and compound words with apostrophe in the middle.
If the user specifies the format "m", "mac" or "mc" (McOption), Irish/Scottish names will be included in the pattern analysis. This option is particularly tricky, since it can produce undesirable results on non-Irish/Scottish names. Consider "MacDonald" vs. "Machado". With the McOption enabled, the format provider will capitalize "Machado" into "MacHado", which is generally undesirable. To solve this problem within the Identity Management project, you should use attribute flow precedence and a dedicated data-source which will contain the exceptions to case capitalization. (See my blog: http://kdmitry.spaces.live.com/blog/.)
Attention:
This code was designed and tested in the context of Identity Management proper case formatting. Applying this format provider to general text could produce undesirable results.
[assembly: System.CLSCompliant(true)]
namespace LostAndFoundIdentity.Text
{
using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;
/// <summary>
/// Custom format provider that converts a string to "proper case": the first
/// letter of every word is capitalized, with special handling for salutations
/// (Mr., Mrs., Jr. ...), Roman numerals, degrees/designations (PhD, MCSE ...),
/// hyphenated and apostrophe-separated words and, optionally, Mc/Mac names.
/// </summary>
/// <remarks>
/// Supported format strings (case-insensitive):
/// "p" - proper case without the Mc/Mac rule;
/// "m", "mac", "mc" - proper case with the Mc/Mac rule ("mccarthy" becomes "McCarthy").
/// Any other (or missing) format string falls back to the argument's own formatting.
/// The class keeps no per-call state; the pattern table is built once and shared.
/// </remarks>
[SuppressMessage("Microsoft.Naming",
    "CA1704:IdentifiersShouldBeSpelledCorrectly",
    MessageId = "Lafi",
    Justification = "Nothing wrong with 'Lafi'; It is stands for Lost and Found Identity")]
public class LafiProperCaseFormatProvider : ICustomFormatter, IFormatProvider
{
    #region Fields

    private const string Space = " ";
    private const char Hyphen = '-';
    private const char Apostrophe = '\'';

    // All token patterns are matched case-insensitively and culture-invariantly.
    private const RegexOptions MatchOptions =
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

    private static readonly char[] TokenSeparators = new char[] { ' ' };

    private static readonly Regex WhitespaceRun = new Regex(@"\s+");

    // Ordered by priority: the first pattern that matches a token wins.
    // An array is used (not a Dictionary) because Dictionary enumeration
    // order is undefined and the priority must be deterministic.
    private static readonly KeyValuePair<Pattern, Regex>[] Patterns = BuildPatterns();

    #endregion Fields

    #region Constructors

    /// <summary>
    /// Initializes a new instance of the format provider.
    /// </summary>
    public LafiProperCaseFormatProvider()
    {
    }

    #endregion Constructors

    #region Enums

    /// <summary>
    /// Special capitalization rules that can apply to a single token.
    /// </summary>
    private enum Pattern
    {
        None = 0,
        AllUpperCase = 1,
        FirstAndLastCapitals = 2,
        McAndMac = 8,
        RomanNumerals = 16,
        Salutation = 32
    }

    #endregion Enums

    #region Interface implementation

    /// <summary>
    /// Formats <paramref name="arg"/> according to <paramref name="format"/>.
    /// </summary>
    /// <param name="format">
    /// "p" for proper case, "m"/"mac"/"mc" for proper case with the Mc/Mac rule.
    /// May be null when the composite format item has no format string.
    /// </param>
    /// <param name="arg">The object to format; null yields an empty string.</param>
    /// <param name="formatProvider">Not used by this implementation.</param>
    /// <returns>The formatted string.</returns>
    public string Format(string format, object arg, IFormatProvider formatProvider)
    {
        if (arg == null)
        {
            return string.Empty;
        }

        // string.Format passes a null format for items such as "{0}".
        if (format != null)
        {
            switch (format.ToUpperInvariant())
            {
                case "M":
                case "MAC":
                case "MC":
                    {
                        return FormatProperCase(arg.ToString(), true);
                    }

                case "P":
                    {
                        return FormatProperCase(arg.ToString(), false);
                    }
            }
        }

        // Unsupported format: defer to the argument's own formatting so that
        // other items in the same composite string (numbers, dates) keep working.
        IFormattable formattable = arg as IFormattable;
        if (formattable != null)
        {
            return formattable.ToString(format, CultureInfo.CurrentCulture);
        }

        return arg.ToString();
    }

    /// <summary>
    /// Returns this instance when an <see cref="ICustomFormatter"/> is requested.
    /// </summary>
    /// <param name="formatType">The type of formatter requested.</param>
    /// <returns>This instance, or null for any other formatter type.</returns>
    public object GetFormat(Type formatType)
    {
        return formatType == typeof(ICustomFormatter) ? this : null;
    }

    #endregion

    #region Methods

    /// <summary>
    /// Trims the value and collapses every run of white space into one space.
    /// </summary>
    /// <param name="value">Value to be processed.</param>
    /// <returns>The value with normalized white space.</returns>
    private static string ProcessWhitespace(string value)
    {
        return WhitespaceRun.Replace(value.Trim(), Space);
    }

    /// <summary>
    /// Upper-cases the first character of <paramref name="text"/>.
    /// </summary>
    /// <param name="text">Text to capitalize.</param>
    /// <returns>The capitalized text; null or empty input is returned as is.</returns>
    private static string Capitalize(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return text;
        }

        return text.Substring(0, 1).ToUpperInvariant() + text.Substring(1);
    }

    /// <summary>
    /// Finds the first (highest priority) pattern that matches the token.
    /// </summary>
    /// <param name="value">A single lower-cased token.</param>
    /// <param name="match">The successful match, or <see cref="Match.Empty"/>.</param>
    /// <returns>The detected pattern, or <see cref="Pattern.None"/>.</returns>
    private static Pattern DetectPattern(string value, out Match match)
    {
        foreach (KeyValuePair<Pattern, Regex> pair in Patterns)
        {
            Match candidate = pair.Value.Match(value);
            if (candidate.Success)
            {
                match = candidate;
                return pair.Key;
            }
        }

        match = Match.Empty;
        return Pattern.None;
    }

    /// <summary>
    /// Converts a whole value (one or more words) to proper case.
    /// </summary>
    /// <param name="value">Value to be processed.</param>
    /// <param name="applyMcRule">True to capitalize the letter after Mc/Mac.</param>
    /// <returns>The proper-cased value; empty for null, empty or blank input.</returns>
    [SuppressMessage("Microsoft.Globalization",
        "CA1308:NormalizeStringsToUppercase",
        Justification = "By design")]
    private static string FormatProperCase(string value, bool applyMcRule)
    {
        if (string.IsNullOrEmpty(value))
        {
            return string.Empty;
        }

        string normalized = ProcessWhitespace(value).ToLowerInvariant();

        // RemoveEmptyEntries guarantees no empty token reaches Capitalize/Substring.
        string[] tokens = normalized.Split(TokenSeparators, StringSplitOptions.RemoveEmptyEntries);
        StringBuilder output = new StringBuilder(normalized.Length);

        foreach (string token in tokens)
        {
            if (output.Length > 0)
            {
                output.Append(Space);
            }

            output.Append(FormatToken(token, applyMcRule));
        }

        return output.ToString();
    }

    /// <summary>
    /// Applies the capitalization rules to a single lower-cased, non-empty token.
    /// </summary>
    /// <param name="token">Token to be processed.</param>
    /// <param name="applyMcRule">True to capitalize the letter after Mc/Mac.</param>
    /// <returns>The capitalized token.</returns>
    private static string FormatToken(string token, bool applyMcRule)
    {
        // Compound words take precedence over the whole-token patterns. The
        // apostrophe is split first; hyphens inside each part are handled by
        // the recursive call made from FormatSeparatedValue.
        if (token.IndexOf(Apostrophe) > -1)
        {
            return FormatSeparatedValue(token, Apostrophe, applyMcRule);
        }

        if (token.IndexOf(Hyphen) > -1)
        {
            return FormatSeparatedValue(token, Hyphen, applyMcRule);
        }

        Match match;
        switch (DetectPattern(token, out match))
        {
            case Pattern.FirstAndLastCapitals:
                {
                    // "phd" -> "PhD", "legd" -> "LegD" (periods are preserved).
                    return token.Replace("p", "P").Replace("l", "L").Replace("d", "D");
                }

            case Pattern.RomanNumerals:
            case Pattern.AllUpperCase:
                {
                    return token.ToUpperInvariant();
                }

            case Pattern.McAndMac:
                {
                    if (applyMcRule)
                    {
                        // Group 1 is the "mc"/"mac" prefix, group 2 the rest of the name.
                        return Capitalize(match.Groups[1].Value) +
                            Capitalize(match.Groups[2].Value);
                    }

                    return Capitalize(token);
                }

            default:
                {
                    // Salutations and ordinary words: first letter only.
                    return Capitalize(token);
                }
        }
    }

    /// <summary>
    /// Formats a "separated" string to ensure that hyphenated
    /// and apostrophe-separated strings are properly capitalized.
    /// </summary>
    /// <param name="value">Value to be processed.</param>
    /// <param name="separator">A separator character.</param>
    /// <param name="applyMcRule">True to capitalize the letter after Mc/Mac.</param>
    /// <returns>Properly formatted "separated" string.</returns>
    private static string FormatSeparatedValue(string value, char separator, bool applyMcRule)
    {
        string[] parts = value.Split(separator);
        StringBuilder result = new StringBuilder(value.Length);

        for (int i = 0; i < parts.Length; i++)
        {
            if (i > 0)
            {
                result.Append(separator);
            }

            // Empty parts come from leading, trailing or doubled separators.
            if (parts[i].Length > 0)
            {
                result.Append(FormatProperCase(parts[i], applyMcRule));
            }
        }

        return result.ToString();
    }

    /// <summary>
    /// Builds the priority-ordered table of patterns and their regular expressions.
    /// </summary>
    /// <returns>Pattern/regex pairs in detection order.</returns>
    private static KeyValuePair<Pattern, Regex>[] BuildPatterns()
    {
        // a regular expression to define salutations for the proper case function
        string salutations =
            @"(^m(r|s)\.?$)|(^mrs\.?$)|(^mi(s){2}\.?$)|(^(j|s)r\.?,?$)";

        // a regular expression string to match PhD or LegD and any variants with periods
        string firstLastCap = @"(^leg\.?d\.?,?$)|(^ph\.?d\.?,?$)";

        // a regular expression string that matches degrees and professional
        // designations and ensures that they are in all caps.
        // This will match: MVP and MCP, DSC, CNA, CCNA and CCNP,
        // MCSE and MCSA and MCSD, CISM and CISA, DDS, RN, MD and OD,
        // BA and MA, CISSP - each with an optional trailing period/comma.
        string allUpperCase = @"(^m(v|c)p\,?\.?$)|(^dsc\.?\,?$)|(^cna\.?\," +
            @"?$)|(^c{2}n(a|p)\.?\,?$)|(^mcs[ead]\.?\,?$)|(^cis(a|m)\.?\,?$)|" +
            @"(^d{2}s\.?\,?$)|(^rn\.?\,?$)|(^(m|o)\.?d\.?\,?$" +
            @")|(^(b|m)\.?a\.?\,?$)|(^cis{2}p\.?\,?$)";

        // a regular expression to match the Mc's (but not MCSE/MCSA/MCSD)
        string mcAndMac = @"^(ma?c)(?!s[ead]$)((.+))$";

        string romanNumerals = @"^((?=[MDCLXVI])((M{0,3})((C[DM])|(D?" +
            @"C{0,3}))?((X[LC])|(L?XX{0,2})|L)?((I[VX])|(V?(II{0,2}))|V)?)),?$";

        return new KeyValuePair<Pattern, Regex>[]
        {
            new KeyValuePair<Pattern, Regex>(
                Pattern.AllUpperCase, new Regex(allUpperCase, MatchOptions)),
            new KeyValuePair<Pattern, Regex>(
                Pattern.FirstAndLastCapitals, new Regex(firstLastCap, MatchOptions)),
            new KeyValuePair<Pattern, Regex>(
                Pattern.McAndMac, new Regex(mcAndMac, MatchOptions)),
            new KeyValuePair<Pattern, Regex>(
                Pattern.RomanNumerals, new Regex(romanNumerals, MatchOptions)),
            new KeyValuePair<Pattern, Regex>(
                Pattern.Salutation, new Regex(salutations, MatchOptions))
        };
    }

    #endregion Methods
}
}