I am using the accepted solution here to convert an excel sheet into a datatable. This works fine if I have \"perfect\" data but if I have a blank cell in the middle of my
Okay, I'm not exactly an expert on this but the other answers do seem like over kill to me so here's my solution:
// Loop through each row in the spreadsheet, skipping the header row
foreach (var row in sheetData.Elements<Row>().Skip(1))
{
var i = 0;
string[] letters = new string[15] {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O" };
List<String> cellsList = new List<string>();
foreach (var cell in row.Elements<Cell>().ToArray())
{
while (cell.CellReference.ToString()[0] != Convert.ToChar(letters[i]))
{//accounts for multiple consecutive blank cells
cellsList.Add("");
i++;
}
cellsList.Add(cell.CellValue.Text);
i++;
}
string[] cells = cellsList.ToArray();
foreach(var cell in cellsList)
{
//display contents of cell, depending on the datatype you may need to call each of the cells manually
}
}
Hope someone finds this useful!
You can use this function to extract a cell from a row passing the header index:
public static Cell GetCellFromRow(Row r ,int headerIdx) {
string cellname = GetNthColumnName(headerIdx) + r.RowIndex.ToString();
IEnumerable<Cell> cells = r.Elements<Cell>().Where(x=> x.CellReference == cellname);
if (cells.Count() > 0)
{
return cells.First();
}
else {
return null;
}
}
public static string GetNthColumnName(int n)
{
string name = "";
while (n > 0)
{
n--;
name = (char)('A' + n % 26) + name;
n /= 26;
}
return name;
}
With apologies for posting yet another answer to this question, here's the code I used.
I was having problems with OpenXML not working properly if a worksheet had a blank row at the top. It would sometimes just return a DataTable with 0 rows and 0 columns in it. The code below copes with this, and all other worksheets.
Here's how you would call my code. Just pass in a filename and the name of the Worksheet to read in:
DataTable dt = OpenXMLHelper.ExcelWorksheetToDataTable("C:\\SQL Server\\SomeExcelFile.xlsx", "Mikes Worksheet");
And here's the code itself:
public class OpenXMLHelper
{
// A helper function to open an Excel file using OpenXML, and return a DataTable containing all the data from one
// of the worksheets.
//
// We've had lots of problems reading in Excel data using OLEDB (eg the ACE drivers no longer being present on new servers,
// OLEDB not working due to security issues, and blatantly ignoring blank rows at the top of worksheets), so this is a more
// stable method of reading in the data.
//
public static DataTable ExcelWorksheetToDataTable(string pathFilename, string worksheetName)
{
DataTable dt = new DataTable(worksheetName);
using (SpreadsheetDocument document = SpreadsheetDocument.Open(pathFilename, false))
{
// Find the sheet with the supplied name, and then use that
// Sheet object to retrieve a reference to the first worksheet.
Sheet theSheet = document.WorkbookPart.Workbook.Descendants<Sheet>().Where(s => s.Name == worksheetName).FirstOrDefault();
if (theSheet == null)
throw new Exception("Couldn't find the worksheet: " + worksheetName);
// Retrieve a reference to the worksheet part.
WorksheetPart wsPart = (WorksheetPart)(document.WorkbookPart.GetPartById(theSheet.Id));
Worksheet workSheet = wsPart.Worksheet;
string dimensions = workSheet.SheetDimension.Reference.InnerText; // Get the dimensions of this worksheet, eg "B2:F4"
int numOfColumns = 0;
int numOfRows = 0;
CalculateDataTableSize(dimensions, ref numOfColumns, ref numOfRows);
System.Diagnostics.Trace.WriteLine(string.Format("The worksheet \"{0}\" has dimensions \"{1}\", so we need a DataTable of size {2}x{3}.", worksheetName, dimensions, numOfColumns, numOfRows));
SheetData sheetData = workSheet.GetFirstChild<SheetData>();
IEnumerable<Row> rows = sheetData.Descendants<Row>();
string[,] cellValues = new string[numOfColumns, numOfRows];
int colInx = 0;
int rowInx = 0;
string value = "";
SharedStringTablePart stringTablePart = document.WorkbookPart.SharedStringTablePart;
// Iterate through each row of OpenXML data, and store each cell's value in the appropriate slot in our [,] string array.
foreach (Row row in rows)
{
for (int i = 0; i < row.Descendants<Cell>().Count(); i++)
{
// *DON'T* assume there's going to be one XML element for each column in each row...
Cell cell = row.Descendants<Cell>().ElementAt(i);
if (cell.CellValue == null || cell.CellReference == null)
continue; // eg when an Excel cell contains a blank string
// Convert this Excel cell's CellAddress into a 0-based offset into our array (eg "G13" -> [6, 12])
colInx = GetColumnIndexByName(cell.CellReference); // eg "C" -> 2 (0-based)
rowInx = GetRowIndexFromCellAddress(cell.CellReference)-1; // Needs to be 0-based
// Fetch the value in this cell
value = cell.CellValue.InnerXml;
if (cell.DataType != null && cell.DataType.Value == CellValues.SharedString)
{
value = stringTablePart.SharedStringTable.ChildElements[Int32.Parse(value)].InnerText;
}
cellValues[colInx, rowInx] = value;
}
}
// Copy the array of strings into a DataTable.
// We don't (currently) make any attempt to work out which columns should be numeric, rather than string.
for (int col = 0; col < numOfColumns; col++)
dt.Columns.Add("Column_" + col.ToString());
for (int row = 0; row < numOfRows; row++)
{
DataRow dataRow = dt.NewRow();
for (int col = 0; col < numOfColumns; col++)
{
dataRow.SetField(col, cellValues[col, row]);
}
dt.Rows.Add(dataRow);
}
#if DEBUG
// Write out the contents of our DataTable to the Output window (for debugging)
string str = "";
for (rowInx = 0; rowInx < maxNumOfRows; rowInx++)
{
for (colInx = 0; colInx < maxNumOfColumns; colInx++)
{
object val = dt.Rows[rowInx].ItemArray[colInx];
str += (val == null) ? "" : val.ToString();
str += "\t";
}
str += "\n";
}
System.Diagnostics.Trace.WriteLine(str);
#endif
return dt;
}
}
private static void CalculateDataTableSize(string dimensions, ref int numOfColumns, ref int numOfRows)
{
// How many columns & rows of data does this Worksheet contain ?
// We'll read in the Dimensions string from the Excel file, and calculate the size based on that.
// eg "B1:F4" -> we'll need 6 columns and 4 rows.
//
// (We deliberately ignore the top-left cell address, and just use the bottom-right cell address.)
try
{
string[] parts = dimensions.Split(':'); // eg "B1:F4"
if (parts.Length != 2)
throw new Exception("Couldn't find exactly *two* CellAddresses in the dimension");
numOfColumns = 1 + GetColumnIndexByName(parts[1]); // A=1, B=2, C=3 (1-based value), so F4 would return 6 columns
numOfRows = GetRowIndexFromCellAddress(parts[1]);
}
catch
{
throw new Exception("Could not calculate maximum DataTable size from the worksheet dimension: " + dimensions);
}
}
public static int GetRowIndexFromCellAddress(string cellAddress)
{
// Convert an Excel CellReference column into a 1-based row index
// eg "D42" -> 42
// "F123" -> 123
string rowNumber = System.Text.RegularExpressions.Regex.Replace(cellAddress, "[^0-9 _]", "");
return int.Parse(rowNumber);
}
public static int GetColumnIndexByName(string cellAddress)
{
// Convert an Excel CellReference column into a 0-based column index
// eg "D42" -> 3
// "F123" -> 5
var columnName = System.Text.RegularExpressions.Regex.Replace(cellAddress, "[^A-Z_]", "");
int number = 0, pow = 1;
for (int i = columnName.Length - 1; i >= 0; i--)
{
number += (columnName[i] - 'A' + 1) * pow;
pow *= 26;
}
return number - 1;
}
}
The letter code is a base 26 encoding so this should work to convert it into an offset.
// Converts letter code (i.e. AA) to an offset
public int offset( string code)
{
var offset = 0;
var byte_array = Encoding.ASCII.GetBytes( code ).Reverse().ToArray();
for( var i = 0; i < byte_array.Length; i++ )
{
offset += (byte_array[i] - 65 + 1) * Convert.ToInt32(Math.Pow(26.0, Convert.ToDouble(i)));
}
return offset - 1;
}
Here's a slightly modified version of Waylon's answer which also relied on other answers. It encapsulates his method in a class.
I changed
IEnumerator<Cell> GetEnumerator()
to
IEnumerable<Cell> GetRowCells(Row row)
Here's the class, you don't need to instantiate it, it just serves as an utility class:
public class SpreedsheetHelper
{
///<summary>returns an empty cell when a blank cell is encountered
///</summary>
public static IEnumerable<Cell> GetRowCells(Row row)
{
int currentCount = 0;
foreach (DocumentFormat.OpenXml.Spreadsheet.Cell cell in
row.Descendants<DocumentFormat.OpenXml.Spreadsheet.Cell>())
{
string columnName = GetColumnName(cell.CellReference);
int currentColumnIndex = ConvertColumnNameToNumber(columnName);
for (; currentCount < currentColumnIndex; currentCount++)
{
yield return new DocumentFormat.OpenXml.Spreadsheet.Cell();
}
yield return cell;
currentCount++;
}
}
/// <summary>
/// Given a cell name, parses the specified cell to get the column name.
/// </summary>
/// <param name="cellReference">Address of the cell (ie. B2)</param>
/// <returns>Column Name (ie. B)</returns>
public static string GetColumnName(string cellReference)
{
// Match the column name portion of the cell name.
var regex = new System.Text.RegularExpressions.Regex("[A-Za-z]+");
var match = regex.Match(cellReference);
return match.Value;
}
/// <summary>
/// Given just the column name (no row index),
/// it will return the zero based column index.
/// </summary>
/// <param name="columnName">Column Name (ie. A or AB)</param>
/// <returns>Zero based index if the conversion was successful</returns>
/// <exception cref="ArgumentException">thrown if the given string
/// contains characters other than uppercase letters</exception>
public static int ConvertColumnNameToNumber(string columnName)
{
var alpha = new System.Text.RegularExpressions.Regex("^[A-Z]+$");
if (!alpha.IsMatch(columnName)) throw new ArgumentException();
char[] colLetters = columnName.ToCharArray();
Array.Reverse(colLetters);
int convertedValue = 0;
for (int i = 0; i < colLetters.Length; i++)
{
char letter = colLetters[i];
int current = i == 0 ? letter - 65 : letter - 64; // ASCII 'A' = 65
convertedValue += current * (int)Math.Pow(26, i);
}
return convertedValue;
}
}
Now you're able to get all rows' cells in this way:
// skip the part that retrieves the worksheet sheetData
IEnumerable<Row> rows = sheetData.Descendants<Row>();
foreach(Row row in rows)
{
IEnumerable<Cell> cells = SpreedsheetHelper.GetRowCells(row);
foreach (Cell cell in cells)
{
// skip part that reads the text according to the cell-type
}
}
It will contain all cells even if they are empty.
See my implementation:
Row[] rows = worksheet.GetFirstChild<SheetData>()
.Elements<Row>()
.ToArray();
string[] columnNames = rows.First()
.Elements<Cell>()
.Select(cell => GetCellValue(cell, document))
.ToArray();
HeaderLetters = ExcelHeaderHelper.GetHeaderLetters((uint)columnNames.Count());
if (columnNames.Count() != HeaderLetters.Count())
{
throw new ArgumentException("HeaderLetters");
}
IEnumerable<List<string>> cellValues = GetCellValues(rows.Skip(1), columnNames.Count(), document);
//Here you can enumerate through the cell values, based on the cell index the column names can be retrieved.
HeaderLetters are collected using this class:
private static class ExcelHeaderHelper
{
public static string[] GetHeaderLetters(uint max)
{
var result = new List<string>();
int i = 0;
var columnPrefix = new Queue<string>();
string prefix = null;
int prevRoundNo = 0;
uint maxPrefix = max / 26;
while (i < max)
{
int roundNo = i / 26;
if (prevRoundNo < roundNo)
{
prefix = columnPrefix.Dequeue();
prevRoundNo = roundNo;
}
string item = prefix + ((char)(65 + (i % 26))).ToString(CultureInfo.InvariantCulture);
if (i <= maxPrefix)
{
columnPrefix.Enqueue(item);
}
result.Add(item);
i++;
}
return result.ToArray();
}
}
And the helper methods are:
private static IEnumerable<List<string>> GetCellValues(IEnumerable<Row> rows, int columnCount, SpreadsheetDocument document)
{
var result = new List<List<string>>();
foreach (var row in rows)
{
List<string> cellValues = new List<string>();
var actualCells = row.Elements<Cell>().ToArray();
int j = 0;
for (int i = 0; i < columnCount; i++)
{
if (actualCells.Count() <= j || !actualCells[j].CellReference.ToString().StartsWith(HeaderLetters[i]))
{
cellValues.Add(null);
}
else
{
cellValues.Add(GetCellValue(actualCells[j], document));
j++;
}
}
result.Add(cellValues);
}
return result;
}
private static string GetCellValue(Cell cell, SpreadsheetDocument document)
{
bool sstIndexedcell = GetCellType(cell);
return sstIndexedcell
? GetSharedStringItemById(document.WorkbookPart, Convert.ToInt32(cell.InnerText))
: cell.InnerText;
}
private static bool GetCellType(Cell cell)
{
return cell.DataType != null && cell.DataType == CellValues.SharedString;
}
private static string GetSharedStringItemById(WorkbookPart workbookPart, int id)
{
return workbookPart.SharedStringTablePart.SharedStringTable.Elements<SharedStringItem>().ElementAt(id).InnerText;
}
The solution deals with shared cell items (SST indexed cells).