Background
A few days ago, I had to work with large data tables which may have more than 48000000 records. I had to split them into fixed size separate datatables and process them as needed. I found a number of ways to do split and today, I am going to share all possible ways considering efficiency.
DataTable Splitter
We are about to create a few splitter classes, that we are going to use in our helper class.
Base Splitter
using System;
using System.CodeDom;
using System.Collections.Generic;
using System.Data;
public abstract class DataTableSplitter
{
protected DataTable SourceTable;
protected int BatchSize;
protected string TableNamePrefix;
protected DataTableSplitter(DataTable table, int size, string namePrefix = "")
{
SourceTable = table;
BatchSize = size;
TableNamePrefix = namePrefix;
}
protected DataTable NewSplitTable(int tableCount)
{
DataTable dataTable = SourceTable.Clone();
dataTable.TableName = !string.IsNullOrEmpty(TableNamePrefix)
? TableNamePrefix + "_" + tableCount
: String.Empty;
dataTable.Clear();
return dataTable;
}
public abstract IEnumerable<DataTable> Split();
}
Regular Splitter (Without Making Any Change to the Source)
This splitter will create new datatables without introducing any change to source datatable, source datatable will be as it was.
public class DataTableRegularSplitter : DataTableSplitter
{
public DataTableRegularSplitter
(DataTable table, int size, string namePrefix = "") : base(table, size, namePrefix)
{
}
public override IEnumerable<DataTable> Split()
{
int tableCount = 0;
int rowCount = 0;
DataTable dataTable = null;
foreach (DataRow row in SourceTable.Rows)
{
if (dataTable == null)
{
tableCount++;
dataTable = NewSplitTable(tableCount);
}
DataRow newRow = dataTable.NewRow();
newRow.ItemArray = row.ItemArray;
dataTable.Rows.Add(newRow);
rowCount++;
if (rowCount == BatchSize)
{
yield return dataTable;
rowCount = 0;
dataTable = null;
}
}
if (dataTable != null && dataTable.Rows.Count > 0)
{
yield return dataTable;
}
}
}
Remove Splitter (Removing Rows From Source Using Remove Functionality)
This splitter will create new datatables and will include changes to source datatable
. At the end of the process, there will be no row at the source datatable. Remove()
method basically removes the row immediately.
public class DataTableRemoveSplitter : DataTableSplitter
{
public DataTableRemoveSplitter
(DataTable table, int size, string namePrefix = "") : base(table, size, namePrefix)
{
}
public override IEnumerable<DataTable> Split()
{
int tableCount = 0;
int rowCount = 0;
DataTable dataTable = null;
while (SourceTable.Rows.Count > 0)
{
DataRow row = SourceTable.Rows[0];
if (dataTable == null)
{
tableCount++;
dataTable = NewSplitTable(tableCount);
}
DataRow newRow = dataTable.NewRow();
newRow.ItemArray = row.ItemArray;
dataTable.Rows.Add(newRow);
SourceTable.Rows.Remove(row);
rowCount++;
if (rowCount == BatchSize)
{
yield return dataTable;
rowCount = 0;
dataTable = null;
}
}
if (dataTable != null && dataTable.Rows.Count > 0)
{
yield return dataTable;
}
}
}
Remove Splitter (Removing Rows From Source Using Delete Functionality)
This splitter is going to do the same as above. By calling Delete()
method, the row state becomes deleted and remains in the datatable
until we call AcceptChanges()
.
public class DataTableDeleteSplitter : DataTableSplitter
{
public DataTableDeleteSplitter(DataTable table,
int size, string namePrefix = "") : base(table, size, namePrefix)
{
}
public override IEnumerable<DataTable> Split()
{
SourceTable.AcceptChanges();
int tableCount = 0;
int rowCount = 0;
DataTable dataTable = null;
for (int sourceTableIndex = 0; SourceTable.Rows.Count > 0; sourceTableIndex++)
{
DataRow row = SourceTable.Rows[sourceTableIndex];
if (dataTable == null)
{
tableCount++;
dataTable = NewSplitTable(tableCount);
}
DataRow newRow = dataTable.NewRow();
newRow.ItemArray = row.ItemArray;
dataTable.Rows.Add(newRow);
row.Delete();
rowCount++;
if (rowCount == BatchSize)
{
SourceTable.AcceptChanges();
yield return dataTable;
rowCount = 0;
dataTable = null;
sourceTableIndex = -1;
}
}
if (dataTable != null && dataTable.Rows.Count > 0)
{
SourceTable.AcceptChanges();
yield return dataTable;
}
}
}
Remove Vs Delete
- Rows
Remove()
method removes dataRow
immediately and won't be recovered. Delete()
method waits for Acceptchanges()
call. Can be recovered by using Rejectchanges()
. Remove()
is faster than Delete()
. Delete()
is good with db attached datatables.
Split Helper Class
Here in this helper, we are going to use our slipper classes. We will also be able to include options on how to remove a row from the source datatable:
using System;
using System.Collections.Generic;
using System.Data;
public class DataTableHelper
{
public enum RowRemoveType
{
Remove,
Delete
}
public static IEnumerable<DataTable>
Split(DataTable sourceTable, int batchSize, string tableNamePrefix = "")
{
return new DataTableRegularSplitter(sourceTable, batchSize, tableNamePrefix).Split();
}
public static IEnumerable<DataTable> SplitAndRemoveFromSource
(DataTable sourceTable, int batchSize, string tableNamePrefix = "",
RowRemoveType rowRemoveType = RowRemoveType.Remove)
{
switch (rowRemoveType)
{
case RowRemoveType.Remove:
return new DataTableRemoveSplitter
(sourceTable, batchSize, tableNamePrefix).Split();
case RowRemoveType.Delete:
return new DataTableDeleteSplitter
(sourceTable, batchSize, tableNamePrefix).Split();
default:
throw new Exception("Unknown remove type");
}
}
}
Using Split Helper
Regular Split (Without Making Any Change to the Source)
DataTable table = Data.Table(3);
List<DataTable> tables = DataTableHelper.Split(table, 1).ToList();
Remove Split (Removing Rows From Source Using Remove Functionality)
DataTable table1 = Data.Table(3);
List<DataTable> tables1 = DataTableHelper.SplitAndRemoveFromSource(table1, 1).ToList();
Remove Split (Removing Rows From Source Using Delete Functionality)
DataTable table2 = Data.Table(3);
List<DataTable> tables2 = DataTableHelper.SplitAndRemoveFromSource
(table2, 1, "", DataTableHelper.RowRemoveType.Delete).ToList();
Generic Splitter
This generic splitter can split any IEnumerable
source object including any datatable
. This splitter will not include any change to the source object.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
public static class EnumerableExtension
{
public static IEnumerable<IEnumerable<T>> ToBatch<T>
(this IEnumerable<T> enumerable, int batchSize)
{
int itemsReturned = 0;
var list = enumerable.ToList();
int count = list.Count;
while (itemsReturned < count)
{
int currentChunkSize = Math.Min(batchSize, count - itemsReturned);
yield return list.GetRange(itemsReturned, currentChunkSize);
itemsReturned += currentChunkSize;
}
}
}
Using Generic Splitter
Here, we are using the generic splitter to split both datatable
and list
:
List<DataTable> tables3 = Data.Table(3).AsEnumerable()
.ToBatch(1)
.Select(x => x.CopyToDataTable()).ToList();
List<List<Student>> lists1 = Data.List(3).ToBatch(1).Select(x => x.ToList()).ToList();
Good to Know!!!
Please find the Visual Studio 2017 solution as an attachment. If I have missed anything or if you have any suggestions, just let me know.
History
- 19th May, 2019: Initial version