Introduction
When it comes to most data centric systems, there is often a need to import and parse CSV files and export that to databases like Oracle, SQL Server. .NET actually has a built in CSV parser, although it is well hidden in a VB.NET namespace. It is just a basic one and doesn’t provide direct data interoperability. Most of the time, programmers take this CSV parsing job as a simple task and end up in doing much more complex logic and a module which cannot go with inbuilt .NET databinding and ADO.NET libraries.
This usually results in a library that is not portable, low in efficiency and also does not go with requirements like handling column names to match with a database field. The library is created as an effort to create a generalized module to support most common requirements which a CSV parser can come across.
Features
- Follows the
IDataReader
interface which adds capabilities to use it in conjunction with common ADO.NET libraries like SQLBulkcopy
. - Module contains provisions to add an additional dummy field if required, rename an existing header to match the database table.
- The module provides a blazing fast algorithm to read through CSV files of huge sizes. Testing this module against 2GB files showed performance equal or better than any of the libraries found in web until today.
Overview of Methods
- Implements
IDataReader
pattern and usage is same as any datareader
(SQLDataReader
, OracleDataReader
). AddVirtualColumn
– Adds a new virtual column at the beginning of each CSV row. RenameCSVHeader
- Renames virtual column header of CSV row. UpdateVirtualColumnValue
- Updates the value of the virtual column.
Process Flow
Using the Code
CSVDataReader rdr = new CSVDataReader(@"..\..\..\Data.csv");
rdr.AddVirtualColumn("Virtual_Column1", "1");
string header = "";
foreach (var item in rdr.Header)
{
header = header + " - " + item;
}
Console.WriteLine(header);
int i = 1;
while (rdr.Read())
{
Console.WriteLine(rdr[0] + " - " + rdr[rdr.Header[1]] + " - " +
rdr[rdr.Header[2]] + " - " + rdr[rdr.Header[3]]);
rdr.UpdateVirtualColumnValue("Virtual_Column1", (i++).ToString());
rdr.Read();
}
Console.ReadLine();
rdr.Close();
Console.Clear();
rdr = new CSVDataReader(@"..\..\..\Data.csv");
System.Data.SqlClient.SqlBulkCopy bcp =
new SqlBulkCopy(destconnectionStr, SqlBulkCopyOptions.UseInternalTransaction);
bcp.BatchSize = 500;
bcp.DestinationTableName = destinationTableName;
bcp.NotifyAfter = 500;
bcp.SqlRowsCopied += (sender,e) =>
{
Console.WriteLine("Written: " + e.RowsCopied.ToString());
};
bcp.WriteToServer(rdr);
Implementation
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Data;
namespace System.Data
{
public class CSVDataReader : IDataReader, IDisposable
{
private StreamReader _file;
private char _delimiter;
private string _virtualHeaderString = "", _csvHeaderstring = "",
_csvlinestring = "", _virtuallineString = "";
private bool _firstRowHeader = true;
private string[] _header;
public string[] Header
{
get { return _header; }
}
private System.Collections.Specialized.OrderedDictionary headercollection =
new System.Collections.Specialized.OrderedDictionary();
private string[] _line;
public string[] Line
{
get
{
return _line;
}
}
private int recordsaffected;
private bool _iscolumnlocked = false;
public CSVDataReader(string filePath, char delimiter = ',', bool firstRowHeader = true)
{
_file = File.OpenText(filePath);
_delimiter = delimiter;
_firstRowHeader = firstRowHeader;
if (_firstRowHeader == true)
{
Read();
_csvHeaderstring = _csvlinestring;
_header = ReadRow(_csvHeaderstring);
foreach (var item in _header)
{
if (headercollection.Contains(item) == true)
throw new Exception("Duplicate found in CSV header.
Cannot create a CSV reader instance with duplicate header");
headercollection.Add(item, null);
}
}
else
{
Read();
_csvHeaderstring = _csvlinestring;
_header = ReadRow(_csvHeaderstring);
int i = 0;
_csvHeaderstring = "";
foreach (var item in _header)
{
headercollection.Add("COL_" + i.ToString(), null);
_csvHeaderstring = _csvHeaderstring + "COL_" + i.ToString() + _delimiter;
i++;
}
_csvHeaderstring.TrimEnd(_delimiter);
_header = ReadRow(_csvHeaderstring);
Close();
_file = File.OpenText(filePath);
}
_iscolumnlocked = false;
_csvlinestring = "";
_line = null;
recordsaffected = 0;
}
public bool Read()
{
var result = !_file.EndOfStream;
if (result == true)
{
_csvlinestring = _file.ReadLine();
if (_virtuallineString == "")
_line = ReadRow(_csvlinestring);
else
_line = ReadRow(_virtuallineString + _delimiter + _csvlinestring);
recordsaffected++;
}
if (_iscolumnlocked == false)
_iscolumnlocked = true;
return result;
}
public bool AddVirtualColumn(string columnName, string value)
{
if (value == null)
return false;
if (_iscolumnlocked == true)
throw new Exception("Cannot add new records after Read() is called.");
if (headercollection.Contains(columnName) == true)
throw new Exception("Duplicate found in CSV header.
Cannot create a CSV readerinstance with duplicate header");
headercollection.Add(columnName, value);
if (_virtualHeaderString == "")
_virtualHeaderString = columnName;
else
_virtualHeaderString = columnName + _delimiter + _virtualHeaderString;
_header = ReadRow(_virtualHeaderString + _delimiter + _csvHeaderstring);
if (_virtuallineString == "")
_virtuallineString = value;
else
_virtuallineString = value + _delimiter + _virtuallineString;
_line = ReadRow(_virtuallineString + _delimiter + _csvlinestring);
return true;
}
public bool RenameCSVHeader(string oldColumnName, string newColumnName)
{
if (_iscolumnlocked == true)
throw new Exception("Cannot update header after Read() is called.");
if (headercollection.Contains(oldColumnName) == false)
throw new Exception("CSV header not found. Cannot update.");
string value = headercollection[oldColumnName] == null ?
null : headercollection[oldColumnName].ToString();
int i = 0;
foreach (var item in headercollection.Keys)
{
if (item.ToString() == oldColumnName)
break;
i++;
}
headercollection.RemoveAt(i);
headercollection.Insert(i, newColumnName, value);
if (value == null)
{
_csvHeaderstring = _csvHeaderstring.Replace(oldColumnName, newColumnName);
_header = ReadRow(_virtualHeaderString + _delimiter + _csvHeaderstring);
}
else
{
_virtualHeaderString = _virtualHeaderString.Replace(oldColumnName, newColumnName);
_header = ReadRow(_virtualHeaderString + _delimiter + _csvHeaderstring);
}
return true;
}
public bool UpdateVirtualColumnValue(string columnName, string value)
{
if (value == null)
return false;
if (headercollection.Contains(columnName) == false)
throw new Exception("Unable to find the csv header. Cannot update value.");
if (headercollection.Contains(columnName) == true && headercollection[columnName] == null)
throw new Exception("Cannot update values for default csv based columns.");
headercollection[columnName] = value;
_virtuallineString = "";
foreach (var item in headercollection.Values)
{
if (item != null)
{
_virtuallineString = (string)item + _delimiter + _virtuallineString;
}
}
_virtuallineString = _virtuallineString.TrimEnd(',');
_line = ReadRow(_virtuallineString + _delimiter + _csvlinestring);
return true;
}
private string[] ReadRow(string line)
{
List<string> lines = new List<string>();
if (String.IsNullOrEmpty(line) == true)
return null;
int pos = 0;
int rows = 0;
while (pos < line.Length)
{
string value;
if (line[pos] == '"')
{
pos++;
int start = pos;
while (pos < line.Length)
{
if (line[pos] == '"')
{
pos++;
if (pos >= line.Length || line[pos] != '"')
{
pos--;
break;
}
}
pos++;
}
value = line.Substring(start, pos - start);
value = value.Replace("\"\"", "\"");
}
else
{
int start = pos;
while (pos < line.Length && line[pos] != _delimiter)
pos++;
value = line.Substring(start, pos - start);
}
if (rows < lines.Count)
lines[rows] = value;
else
lines.Add(value);
rows++;
while (pos < line.Length && line[pos] != _delimiter)
pos++;
if (pos < line.Length)
pos++;
}
return lines.ToArray();
}
public void Close()
{
_file.Close();
_file.Dispose();
_file = null;
}
public int Depth
{
get { return 1; }
}
public DataTable GetSchemaTable()
{
DataTable t = new DataTable();
t.Rows.Add(Header);
return t;
}
public bool IsClosed
{
get { return _file == null; }
}
public bool NextResult()
{
return Read();
}
public int RecordsAffected
{
get { return recordsaffected; }
}
public void Dispose()
{
if (_file != null)
{
_file.Dispose();
_file = null;
}
}
public int FieldCount
{
get { return Header.Length; }
}
public bool GetBoolean(int i)
{
return Boolean.Parse(Line[i]);
}
public byte GetByte(int i)
{
return Byte.Parse(Line[i]);
}
public long GetBytes(int i, long fieldOffset, byte[] buffer, int bufferoffset, int length)
{
throw new NotImplementedException();
}
public char GetChar(int i)
{
return Char.Parse(Line[i]);
}
public long GetChars(int i, long fieldoffset, char[] buffer, int bufferoffset, int length)
{
throw new NotImplementedException();
}
public IDataReader GetData(int i)
{
return (IDataReader)this;
}
public string GetDataTypeName(int i)
{
throw new NotImplementedException();
}
public DateTime GetDateTime(int i)
{
return DateTime.Parse(Line[i]);
}
public decimal GetDecimal(int i)
{
return Decimal.Parse(Line[i]);
}
public double GetDouble(int i)
{
return Double.Parse(Line[i]);
}
public Type GetFieldType(int i)
{
return typeof(String);
}
public float GetFloat(int i)
{
return float.Parse(Line[i]);
}
public Guid GetGuid(int i)
{
return Guid.Parse(Line[i]);
}
public short GetInt16(int i)
{
return Int16.Parse(Line[i]);
}
public int GetInt32(int i)
{
return Int32.Parse(Line[i]);
}
public long GetInt64(int i)
{
return Int64.Parse(Line[i]);
}
public string GetName(int i)
{
return Header[i];
}
public int GetOrdinal(string name)
{
int result = -1;
for (int i = 0; i < Header.Length; i++)
if (Header[i] == name)
{
result = i;
break;
}
return result;
}
public string GetString(int i)
{
return Line[i];
}
public object GetValue(int i)
{
return Line[i];
}
public int GetValues(object[] values)
{
values = Line;
return 1;
}
public bool IsDBNull(int i)
{
return string.IsNullOrWhiteSpace(Line[i]);
}
public object this[string name]
{
get { return Line[GetOrdinal(name)]; }
}
public object this[int i]
{
get { return GetValue(i); }
}
}
}
History
The complete solution is attached for easy reference. Comments and bug references are welcome.