I have checked the whole site and googled on the net but was unable to find a simple solution to this problem.
I have a datatable which has about 20 columns and 10K rows.
Liggett78's answer is much better - esp. as mine had an error! Correction as follows...
-- Deletes duplicate rows from TableWithDuplicates, keeping exactly one row for each
-- distinct combination of KeyColumn1..KeyColumn4.
-- NOTE(review): the DELETE ... FROM ... JOIN form and the "alias = expression" select
-- syntax look like T-SQL; verify against the target database.
DELETE TableWithDuplicates
FROM TableWithDuplicates
LEFT OUTER JOIN (
-- Derived table: the single PK_ID that survives for each key combination.
SELECT PK_ID = Min(PK_ID), -- Keeps the lowest PK_ID per group; change the aggregate to choose a different survivor
KeyColumn1,
KeyColumn2,
KeyColumn3,
KeyColumn4
FROM TableWithDuplicates
GROUP BY KeyColumn1,
KeyColumn2,
KeyColumn3,
KeyColumn4
) AS RowsToKeep
ON TableWithDuplicates.PK_ID = RowsToKeep.PK_ID
-- Rows with no partner in RowsToKeep are the surplus copies, so only those are deleted.
WHERE RowsToKeep.PK_ID IS NULL
"This datatable is being created by reading a CSV file and not from a DB."
So put a unique constraint on the four columns in the database, and inserts that are duplicates under your design won't go in. The import may fail outright instead of skipping the duplicate and continuing, but that behaviour is surely configurable in your CSV import script.
Try this
Assume dtInput is your data table containing duplicate records.
I have a new DataTable, dtFinal, which should receive the rows with the duplicates filtered out.
So my code will be something like below.
// Builds dtFinal from dtInput's default view. The first argument (true) asks ToTable for
// distinct rows; the second lists the columns to copy into the new table.
DataTable dtFinal = dtInput.DefaultView.ToTable(true,
// Placeholder column list: replace the names (and the literal "...", which is not valid C#)
// with the real column names of dtInput.
// NOTE(review): an array initializer with an explicit size needs a compile-time constant
// size that matches the number of elements; "new string[] { ... }" avoids that requirement.
new string[ColumnCount] {"Col1Name","Col2Name","Col3Name",...,"ColnName"});
Use a query instead of functions:
-- Joins table1 to a per-id row count (tb2) and deletes the rows whose id occurs more than once.
-- NOTE(review): the join matches every row that shares a repeated id, so all copies would be
-- deleted rather than all but one; confirm that this is the intended result.
-- NOTE(review): "DELETE FROM ... AS alias INNER JOIN ..." is not accepted by every SQL
-- dialect; verify the syntax against the target database.
DELETE FROM table1 AS tb1 INNER JOIN
(SELECT id, COUNT(id) AS cntr FROM table1 GROUP BY id) AS tb2
ON tb1.id = tb2.id WHERE tb2.cntr > 1
I wasn't keen on using the Linq solution above so I wrote this:
/// <summary>
/// Takes a datatable and a column index, and returns a new datatable that keeps only the
/// first row for each distinct value found in that column.
/// </summary>
/// <remarks>
/// Values are compared by their text form (<c>ToString()</c>), using ordinal string equality.
/// Every column of the returned table is typed as <see cref="string"/>, so copied values are
/// stored as text. The input table is not modified.
/// This is a best-effort helper: if anything throws while building the result (for example
/// <paramref name="dt"/> is null or <paramref name="ComparisonFieldIndex"/> is out of range),
/// the original <paramref name="dt"/> is returned unchanged.
/// </remarks>
/// <param name="dt">The datatable containing duplicate records</param>
/// <param name="ComparisonFieldIndex">The zero-based index of the column whose values identify duplicates</param>
/// <returns>A datatable object without duplicated records, or <paramref name="dt"/> itself if processing failed</returns>
public DataTable duplicateRemoval(DataTable dt, int ComparisonFieldIndex)
{
    try
    {
        // Build the new datatable that will be returned: same column names, all typed as string.
        DataTable dtReturn = new DataTable();
        foreach (DataColumn column in dt.Columns)
        {
            dtReturn.Columns.Add(column.ColumnName, typeof(string));
        }

        // Text values of the comparison column that have already been copied. A hash set gives
        // constant-time lookups instead of re-scanning the output table for every input row.
        System.Collections.Generic.HashSet<string> seenKeys = new System.Collections.Generic.HashSet<string>();

        foreach (DataRow dr in dt.Rows)
        {
            // Add returns false when the key is already present, i.e. this row is a duplicate.
            if (!seenKeys.Add(dr[ComparisonFieldIndex].ToString()))
            {
                continue;
            }

            DataRow drAdd = dtReturn.NewRow();
            for (int i = 0; i < dtReturn.Columns.Count; i++)
            {
                drAdd[i] = dr[i];
            }
            dtReturn.Rows.Add(drAdd);
        }

        return dtReturn;
    }
    catch (Exception)
    {
        // Deliberate best-effort fallback: return the original datatable if something failed above.
        return dt;
    }
}
Additionally, this works on ALL columns rather than a specific column index:
/// <summary>
/// Takes a datatable and returns a new datatable in which every row is unique across
/// ALL columns; only the first occurrence of each row is kept.
/// </summary>
/// <remarks>
/// Cells are compared by their text form (<c>ToString()</c>), using ordinal string equality.
/// Every column of the returned table is typed as <see cref="string"/>, so copied values are
/// stored as text. The input table is not modified.
/// This is a best-effort helper: if anything throws while building the result (for example
/// <paramref name="dt"/> is null), the original <paramref name="dt"/> is returned unchanged.
/// </remarks>
/// <param name="dt">The datatable containing duplicate records</param>
/// <returns>A datatable object without duplicated records, or <paramref name="dt"/> itself if processing failed</returns>
public DataTable duplicateRemoval(DataTable dt)
{
    try
    {
        // Build the new datatable that will be returned: same column names, all typed as string.
        DataTable dtReturn = new DataTable();
        foreach (DataColumn column in dt.Columns)
        {
            dtReturn.Columns.Add(column.ColumnName, typeof(string));
        }

        int columnCount = dt.Columns.Count;

        // Composite keys of the rows already copied. A hash set gives constant-time lookups
        // instead of comparing each input row against every row copied so far.
        System.Collections.Generic.HashSet<string> seenRows = new System.Collections.Generic.HashSet<string>();
        System.Text.StringBuilder keyBuilder = new System.Text.StringBuilder();

        foreach (DataRow dr in dt.Rows)
        {
            // Build one key from the text of every cell. Each cell is prefixed with its length
            // so that different rows can never produce the same key, e.g. ("ab", "c") and
            // ("a", "bc") stay distinct.
            keyBuilder.Length = 0;
            for (int i = 0; i < columnCount; i++)
            {
                string cellText = dr[i].ToString();
                keyBuilder.Append(cellText.Length).Append(':').Append(cellText);
            }

            // Add returns false when an identical row has already been copied.
            if (!seenRows.Add(keyBuilder.ToString()))
            {
                continue;
            }

            DataRow drAdd = dtReturn.NewRow();
            for (int i = 0; i < columnCount; i++)
            {
                drAdd[i] = dr[i];
            }
            dtReturn.Rows.Add(drAdd);
        }

        return dtReturn;
    }
    catch (Exception)
    {
        // Deliberate best-effort fallback: return the original datatable if something failed above.
        return dt;
    }
}
This is very simple code which does not require LINQ or individual column names to do the filtering. If all the column values in a row are null or empty, the row is removed.
/// <summary>
/// Copies the first table of a data set into a new data set, leaving out every row whose
/// cells are all null or empty.
/// </summary>
/// <remarks>
/// Despite its name, this method does not compare rows with each other, so repeated rows are
/// kept. Every column of the returned table is typed as <see cref="string"/>, and each copied
/// cell holds the text form (<c>ToString()</c>) of the source value.
/// </remarks>
/// <param name="dSet">The data set whose first table is filtered</param>
/// <returns>A new data set containing one table with the rows that hold at least one value</returns>
public DataSet duplicateRemoval(DataSet dSet)
{
    DataTable source = dSet.Tables[0];
    int columnCount = source.Columns.Count;

    // Result container: a single table with the same column names, all typed as string.
    DataSet result = new DataSet();
    result.Tables.Add(new DataTable());
    DataTable target = result.Tables[0];
    for (int c = 0; c < columnCount; c++)
    {
        target.Columns.Add(source.Columns[c].ColumnName, typeof(string));
    }

    foreach (DataRow sourceRow in source.Rows)
    {
        // Capture the text of every cell and note whether any of them carries data.
        string[] cells = new string[columnCount];
        bool hasData = false;
        for (int c = 0; c < columnCount; c++)
        {
            cells[c] = sourceRow[c].ToString();
            if (!string.IsNullOrEmpty(cells[c]))
            {
                hasData = true;
            }
        }

        // Rows without any data are skipped.
        if (!hasData)
        {
            continue;
        }

        DataRow copy = target.NewRow();
        for (int c = 0; c < columnCount; c++)
        {
            copy[c] = cells[c];
        }
        target.Rows.Add(copy);
    }

    return result;
}
This can even be used to remove null data from an Excel sheet.