I want to extract some data, such as "email addresses", from a table in a PDF file,
and then use the extracted email addresses to send email to those people.
Using the ByteScout PDF Extractor SDK, we can extract the whole page to CSV as shown below.
// CSV extractor and table detector, both registered with the demo credentials.
CSVExtractor extractor = new CSVExtractor
{
    RegistrationName = "demo",
    RegistrationKey = "demo"
};
TableDetector tdetector = new TableDetector
{
    RegistrationKey = "demo",
    RegistrationName = "demo"
};

// Both objects work on the same document.
extractor.LoadDocumentFromFile("C:\\sample.pdf");
tdetector.LoadDocumentFromFile("C:\\sample.pdf");

int pageCount = tdetector.GetPageCount();
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
{
    // NOTE(review): the extraction area below is the page rectangle (not a detected table's
    // rectangle) and FindNextTable() is called without a preceding find call; page numbers
    // start at 1 here - confirm all three against the ByteScout TableDetector documentation.
    for (int tableNumber = 1; ; tableNumber++)
    {
        // Restrict the extractor to the rectangle reported for this page.
        var left = tdetector.GetPageRect_Left(pageNumber);
        var top = tdetector.GetPageRect_Top(pageNumber);
        var width = tdetector.GetPageRect_Width(pageNumber);
        var height = tdetector.GetPageRect_Height(pageNumber);
        extractor.SetExtractionArea(left, top, width, height);

        // Save the selected area of the page as CSV.
        string csvPath = "C:\\page-" + pageNumber + "-table-" + tableNumber + ".csv";
        extractor.SavePageCSVToFile(pageNumber, csvPath);

        // Stop once the detector reports no further table.
        if (!tdetector.FindNextTable())
        {
            break;
        }
    }
}
You absolutely do not have to convert the PDF to Excel. First of all, determine whether your PDF contains textual data or is a scanned image. If it contains textual data, then you are right about using "some free dll". I recommend iTextSharp, as it is popular and easy to use.
Now the controversial part. If you don't need a rock-solid solution, the easiest approach is to read the whole PDF into a string and then retrieve the emails using a regular expression.
Here is an example (not perfect) of reading a PDF with iTextSharp and extracting the emails:
/// <summary>
/// Extracts the text of every page of a PDF with iTextSharp and returns it as a single string.
/// Page texts are appended one after another without any separator.
/// </summary>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <returns>The concatenated text of all pages.</returns>
public string PdfToString(string fileName)
{
    var sb = new StringBuilder();
    var reader = new PdfReader(fileName);
    try
    {
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            // A fresh strategy per page, as the extraction call receives it together with the page number.
            var strategy = new SimpleTextExtractionStrategy();

            // The extracted text is already a .NET string, so it is appended as-is. Re-encoding it
            // through Encoding.Default could only replace characters that the ANSI code page
            // cannot represent with '?', which would corrupt non-ASCII addresses.
            sb.Append(PdfTextExtractor.GetTextFromPage(reader, page, strategy));
        }
    }
    finally
    {
        // Close the reader even when extraction throws.
        reader.Close();
    }
    return sb.ToString();
}
// Captures whatever sits between the "Email Address" and "Passport No" labels in the
// named group "email"; tailor the pattern to the layout of your own document.
Regex emailRegex = new Regex("Email Address (?<email>.+?) Passport No");

/// <summary>
/// Lazily yields the "email" capture of every match of emailRegex found in the given text.
/// </summary>
/// <param name="content">Text to search, e.g. the string extracted from the PDF.</param>
/// <returns>One string per match, in match order.</returns>
public IEnumerable<string> ExtractEmails(string content)
{
    MatchCollection hits = emailRegex.Matches(content);
    foreach (Match hit in hits)
    {
        Group emailGroup = hit.Groups["email"];
        yield return emailGroup.Value;
    }
}
/// <summary>
/// Converts the per-page PDFs that belong to <paramref name="fileNames"/> into one CSV file per
/// page, merges those CSV files into a single Excel workbook saved next to the source PDF as
/// "&lt;name&gt;_rez.xls", and finally deletes the intermediate "&lt;name&gt;-CSVs" folder.
/// </summary>
/// <param name="fileNames">
/// Path of the source PDF. For every page a file named "&lt;name&gt;_1&lt;ext&gt;",
/// "&lt;name&gt;_2&lt;ext&gt;", ... is loaded from the same folder; those files must already
/// exist (presumably produced by an earlier split step - not visible here).
/// </param>
public void Convert(string fileNames)
{
    // Only the page count is needed from iTextSharp, so the reader is closed right away
    // instead of staying open for the rest of the conversion.
    int pageCount;
    iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(fileNames);
    try
    {
        pageCount = reader.NumberOfPages;
    }
    finally
    {
        reader.Close();
    }

    string ext = System.IO.Path.GetExtension(fileNames);
    string sourceFolder = System.IO.Path.GetDirectoryName(fileNames) ?? string.Empty;

    // Deliberately the same case-sensitive ".pdf" stripping as before, so the derived
    // per-page and output file names do not change.
    string baseName = System.IO.Path.GetFileName(fileNames).Replace(".pdf", "");

    // All derived paths are built from the folder of the input file. Previously the CSV
    // folder was created and read under a hard-coded desktop path while the CSV files were
    // written next to the input, which only agreed when the input lived in that exact folder.
    string outfilePDFExcel1 = System.IO.Path.Combine(sourceFolder, baseName + "_rez" + ".xls");
    string pathString = System.IO.Path.Combine(sourceFolder, baseName + "-CSVs");
    System.IO.Directory.CreateDirectory(pathString);

    CSVExtractor extractor = new CSVExtractor();
    extractor.RegistrationName = "demo";
    extractor.RegistrationKey = "demo";

    // One CSV per page: "<name>_<n><ext>" -> "<name>-CSVs\Sheet_<n>.csv".
    for (int i = 0; i < pageCount; i++)
    {
        string pageNumber = (i + 1).ToString();
        string outfilePDF = System.IO.Path.Combine(sourceFolder, baseName + "_" + pageNumber + ext);
        extractor.LoadDocumentFromFile(outfilePDF);
        string outfile = System.IO.Path.Combine(pathString, "Sheet_" + pageNumber + ".csv");
        extractor.SaveCSVToFile(outfile);
    }

    Excel.Application xlApp = new Microsoft.Office.Interop.Excel.Application();
    if (xlApp == null)
    {
        Console.WriteLine("Excel is not properly installed!!");
        return;
    }

    object misValue = System.Reflection.Missing.Value;
    Excel.Workbook xlWorkBook = null;
    bool workbookClosed = false;
    try
    {
        xlWorkBook = xlApp.Workbooks.Add(misValue);

        // Sort so that the sheets follow the page order.
        string[] cvsFiles = Directory.GetFiles(pathString);
        Array.Sort(cvsFiles, new AlphanumComparatorFast());

        for (int i = 0; i < cvsFiles.Length; i++)
        {
            int sheet = i + 1;
            Microsoft.Office.Interop.Excel.Worksheet xlWorkSheet = xlWorkBook.Sheets[sheet];
            try
            {
                // Add the sheet for the next CSV right after the current one.
                if (i < cvsFiles.Length - 1)
                {
                    xlWorkBook.Worksheets.Add(Type.Missing, xlWorkSheet, Type.Missing, Type.Missing);
                }
                WriteCsvRows(cvsFiles[i], xlWorkSheet);
            }
            finally
            {
                releaseObject(xlWorkSheet);
            }
        }

        xlWorkBook.SaveAs(outfilePDFExcel1, Microsoft.Office.Interop.Excel.XlFileFormat.xlWorkbookNormal,
            misValue, misValue, misValue, misValue, Microsoft.Office.Interop.Excel.XlSaveAsAccessMode.xlExclusive,
            misValue, misValue, misValue, misValue, misValue);
        xlWorkBook.Close(true, misValue, misValue);
        workbookClosed = true;
    }
    finally
    {
        // On failure, close the half-built workbook without saving before quitting Excel.
        if (xlWorkBook != null && !workbookClosed)
        {
            xlWorkBook.Close(false, misValue, misValue);
        }
        xlApp.Quit();
        if (xlWorkBook != null)
        {
            releaseObject(xlWorkBook);
        }
        releaseObject(xlApp);
    }

    // Remove the intermediate CSV folder.
    var dir = new DirectoryInfo(pathString);
    dir.Attributes = dir.Attributes & ~FileAttributes.ReadOnly;
    dir.Delete(true);
}

// Copies the quoted fields of every line of csvPath into consecutive cells of sheet,
// one worksheet row per line; empty fields leave their cell untouched.
private static void WriteCsvRows(string csvPath, Microsoft.Office.Interop.Excel.Worksheet sheet)
{
    int sheetRow = 1;
    using (StreamReader csvReader = new StreamReader(File.OpenRead(csvPath)))
    {
        string line;
        while ((line = csvReader.ReadLine()) != null)
        {
            Console.WriteLine(line);
            try
            {
                // Splitting on the quote character leaves the field values at the odd indexes
                // (even indexes hold the separators between quoted fields). Field 1, 3, 5, ...
                // goes straight to column 1, 2, 3, ...; this yields the same layout as writing
                // to every second column and deleting the empty ones afterwards.
                string[] columns = line.Split('"');
                for (int col = 1; col < columns.Length; col += 2)
                {
                    if (columns[col] != "")
                    {
                        sheet.Cells[sheetRow, (col + 1) / 2] = columns[col];
                    }
                }
                sheetRow++;
            }
            catch (Exception e)
            {
                // Best effort, as before: keep going with the next line (the row index is not
                // advanced), but report the failure instead of discarding it silently.
                Console.Error.WriteLine("Could not write row " + sheetRow + " from " + csvPath + ": " + e.Message);
            }
        }
    }
}