Using itextsharp (or any c# pdf library), how to open a PDF, replace some text, and save it again?

后端 未结 3 422
孤城傲影
孤城傲影 2021-02-04 16:09

Using iTextSharp (or any C# PDF library), I need to open a PDF, replace some placeholder text with actual values, and return it as a byte[].

Can someone suggest how to do this?

相关标签:
3条回答
  • 2021-02-04 16:17

    Unfortunately I was looking for something similar and could not figure it out. Below was about as far as I got, maybe you can use this as a starting point. The problem is that PDF does not actually save text, but instead uses lookup tables and some other arcane wizardry. This method reads the byte-values for the page and attempts to convert to string, but as far as I can tell it can only do English and misses on some special characters, so I gave up my project and moved on.

    // Rebuilds the PDF page by page. Pages whose extracted text contains the
    // marker string are left for custom regeneration; every other page is
    // imported unchanged from the source document.
    string contents = string.Empty; // string.Empty is a field, not a method
    Document doc = new Document();
    PdfReader reader = new PdfReader("pathToPdf.pdf");
    using (MemoryStream memoryStream = new MemoryStream())
    {
        PdfWriter writer = PdfWriter.GetInstance(doc, memoryStream);
        doc.Open();
        PdfContentByte cb = writer.DirectContent;

        // Page numbers passed to the reader run from 1 to NumberOfPages.
        for (int p = 1; p <= reader.NumberOfPages; p++)
        {
            // Start a new output page with the same size as the source page.
            doc.SetPageSize(reader.GetPageSize(p));
            doc.NewPage();

            // Best-effort text extraction from the raw page content bytes.
            byte[] bt = reader.GetPageContent(p);
            contents = ExtractTextFromPDFBytes(bt);

            if (contents.IndexOf("something") != -1)
            {
                // make your own pdf page and add to cb (contentbyte)
            }
            else
            {
                PdfImportedPage page = writer.GetImportedPage(reader, p);
                int rot = reader.GetPageRotation(p);
                if (rot == 90 || rot == 270)
                {
                    // Pages reported as rotated are placed with a rotating
                    // transform, shifted by the rotated page height.
                    cb.AddTemplate(page, 0, -1.0F, 1.0F, 0, 0, reader.GetPageSizeWithRotation(p).Height);
                }
                else
                {
                    // Identity transform: place the page as-is.
                    cb.AddTemplate(page, 1.0F, 0, 0, 1.0F, 0, 0);
                }
            }
        }
        reader.Close();
        // The document is closed before the stream contents are read back.
        doc.Close();
        File.WriteAllBytes("pathToOutputOrSamePathToOverwrite.pdf", memoryStream.ToArray());
    }
    

    This is taken from this site.

    /// <summary>
    /// Best-effort extraction of the literal text found between parentheses
    /// inside BT ... ET sections of a raw PDF page content byte array.
    /// </summary>
    /// <param name="input">Raw page content bytes; may be null or empty.</param>
    /// <returns>
    /// The extracted text, or an empty string when <paramref name="input"/> is
    /// null/empty or when any exception occurs while scanning.
    /// </returns>
    /// <remarks>
    /// Each byte is treated as one character and only bytes in the ranges
    /// 32..126 and 128..254 are emitted. Relies on the
    /// <c>_numberOfCharsToKeep</c> field and on <c>CheckToken</c>, both
    /// declared outside this method.
    /// </remarks>
    private string ExtractTextFromPDFBytes(byte[] input)
    {
        if (input == null || input.Length == 0) return "";

        try
        {
            // A builder avoids re-copying the whole result on every append.
            System.Text.StringBuilder resultString = new System.Text.StringBuilder();

            // True while scanning between a BT and an ET token.
            bool inTextObject = false;

            // True when the previous character was an unescaped backslash,
            // so the current character must be taken literally.
            bool nextLiteral = false;

            // 1 while inside a (...) string, 0 otherwise.
            int bracketDepth = 0;

            // Sliding window of the most recent characters, used by CheckToken.
            char[] previousCharacters = new char[_numberOfCharsToKeep];
            for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';

            // Token sets are loop-invariant, so allocate them once.
            string[] positionTokens = new string[] { "TD", "Td" };
            string[] newLineTokens = new string[] { "'", "T*", "\"" };
            string[] showTextTokens = new string[] { "Tj" };
            string[] endTextTokens = new string[] { "ET" };
            string[] beginTextTokens = new string[] { "BT" };

            for (int i = 0; i < input.Length; i++)
            {
                char c = (char)input[i];

                if (inTextObject)
                {
                    // Outside a string: translate recently seen operators
                    // into separators in the output.
                    if (bracketDepth == 0)
                    {
                        if (CheckToken(positionTokens, previousCharacters))
                        {
                            resultString.Append("\n\r");
                        }
                        else if (CheckToken(newLineTokens, previousCharacters))
                        {
                            resultString.Append("\n");
                        }
                        else if (CheckToken(showTextTokens, previousCharacters))
                        {
                            resultString.Append(" ");
                        }
                    }

                    if (bracketDepth == 0 && CheckToken(endTextTokens, previousCharacters))
                    {
                        // End of the text object.
                        inTextObject = false;
                        resultString.Append(" ");
                    }
                    else if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                    {
                        // Start outputting text.
                        bracketDepth = 1;
                    }
                    else if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                    {
                        // Stop outputting text.
                        bracketDepth = 0;
                    }
                    else if (bracketDepth == 1)
                    {
                        if (c == '\\' && !nextLiteral)
                        {
                            // Escape introducer: emit nothing, take the next
                            // character literally.
                            nextLiteral = true;
                        }
                        else
                        {
                            if (((c >= ' ') && (c <= '~')) ||
                                ((c >= 128) && (c < 255)))
                            {
                                resultString.Append(c);
                            }

                            nextLiteral = false;
                        }
                    }
                }

                // Slide the window left by one and store the current character.
                for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
                {
                    previousCharacters[j] = previousCharacters[j + 1];
                }
                previousCharacters[_numberOfCharsToKeep - 1] = c;

                // Start of a text object.
                if (!inTextObject && CheckToken(beginTextTokens, previousCharacters))
                {
                    inTextObject = true;
                }
            }

            return resultString.ToString();
        }
        catch
        {
            // Deliberate best-effort contract: any failure yields an empty result.
            return "";
        }
    }
    
     /// <summary>
     /// Reports whether the recent-character window ends with one of the given
     /// tokens, where the token is immediately preceded and immediately
     /// followed by a space, carriage return or line feed.
     /// </summary>
     /// <param name="tokens">Candidate tokens of any length; null or empty entries are skipped.</param>
     /// <param name="recent">Window of recent characters, with the newest character last.</param>
     /// <returns>True if any token matches at the end of the window; otherwise false.</returns>
     /// <remarks>
     /// Positions are derived from each token's own length, so one-character
     /// tokens such as <c>'</c> are matched without indexing past the end of
     /// the token. Windows too short to hold a token plus both delimiters
     /// never match.
     /// </remarks>
     private bool CheckToken(string[] tokens, char[] recent)
     {
         foreach (string token in tokens)
         {
             if (string.IsNullOrEmpty(token))
             {
                 continue;
             }

             // Layout at the end of the window: [lead][token...][trail]
             int trailIndex = recent.Length - 1;
             int tokenStart = trailIndex - token.Length;
             int leadIndex = tokenStart - 1;
             if (leadIndex < 0)
             {
                 continue;
             }

             if (!IsTokenDelimiter(recent[trailIndex]) || !IsTokenDelimiter(recent[leadIndex]))
             {
                 continue;
             }

             bool matches = true;
             for (int k = 0; k < token.Length; k++)
             {
                 if (recent[tokenStart + k] != token[k])
                 {
                     matches = false;
                     break;
                 }
             }

             if (matches)
             {
                 return true;
             }
         }
         return false;
     }

     // True for the characters accepted around a token: space, CR or LF.
     private static bool IsTokenDelimiter(char c)
     {
         return c == ' ' || c == 0x0d || c == 0x0a;
     }
    
    0 讨论(0)
  • 2021-02-04 16:37

    In the end, I used PDFescape to open my existing PDF file and place form fields where I needed my values to go, then saved it again to create my PDF template file.

    http://www.pdfescape.com

    Then I found this blog entry about how to replace form fields:

    http://www.johnnycode.com/blog/2010/03/05/using-a-template-to-programmatically-create-pdfs-with-c-and-itextsharp/

    All works nicely! Here's the code:

    /// <summary>
    /// Fills the form fields of the template PDF and returns the flattened
    /// result as a byte array.
    /// </summary>
    /// <returns>The bytes of the generated PDF.</returns>
    /// <remarks>
    /// A field is identified by its current value in the template: a field
    /// whose value equals one of the known placeholder strings is overwritten
    /// with the corresponding replacement.
    /// </remarks>
    public static byte[] Generate()
    {
      var templatePath = HttpContext.Current.Server.MapPath("~/my_template.pdf");

      // Based on:
      // http://www.johnnycode.com/blog/2010/03/05/using-a-template-to-programmatically-create-pdfs-with-c-and-itextsharp/
      var reader = new PdfReader(templatePath);
      try
      {
        using (var outStream = new MemoryStream())
        {
          var stamper = new PdfStamper(reader, outStream);
          try
          {
            var form = stamper.AcroFields;
            var fieldKeys = form.Fields.Keys;

            foreach (string fieldKey in fieldKeys)
            {
              // Read the placeholder once; a field holds a single value, so
              // at most one replacement can apply.
              string placeholder = form.GetField(fieldKey);
              if (placeholder == "MyTemplatesOriginalTextFieldA")
              {
                form.SetField(fieldKey, "1234");
              }
              else if (placeholder == "MyTemplatesOriginalTextFieldB")
              {
                form.SetField(fieldKey, "5678");
              }
            }

            // "Flatten" the form so it won't be editable/usable anymore
            stamper.FormFlattening = true;
          }
          finally
          {
            // The stamper is closed before the stream is read back, and also
            // when filling fails.
            stamper.Close();
          }

          return outStream.ToArray();
        }
      }
      finally
      {
        reader.Close();
      }
    }
    
    0 讨论(0)
  • 2021-02-04 16:40

    I have a python script here that replaces some text in a PDF:

    import re
    import sys
    import zlib
    
    # Module to find and replace text in PDF files
    #
    # Usage:
    #   python pdf_replace.py <input_filename> <text_to_find> <text_to_replace> <output_filename>
    #
    # @author Ionox0
    
    input_filename = sys.argv[1]
    # The file is processed as bytes, so the search and replacement terms are
    # converted to bytes as well. latin-1 maps each character to one byte;
    # arguments with characters outside that range raise UnicodeEncodeError.
    text_to_find = sys.argv[2].encode('latin-1')
    text_to_replace = sys.argv[3].encode('latin-1')
    output_filename = sys.argv[4]

    with open(input_filename, "rb") as src:
        pdf = src.read()

    # Edits are applied to this copy; bytes are immutable, so plain
    # assignment is enough.
    pdf_copy = pdf

    # Search for stream objects with text to replace
    stream = re.compile(rb'.*?FlateDecode.*?stream(.*?)endstream', re.S)

    for s in stream.findall(pdf):
        s = s.strip(b'\r\n')

        try:
            text = zlib.decompress(s)
        except zlib.error:
            # Not a valid zlib payload; leave this stream untouched.
            continue

        if text_to_find in text:
            print('Found match:')
            print(text.decode('latin-1'))

            text = text.replace(text_to_find, text_to_replace)
            # NOTE(review): only the stream payload is swapped here; nothing
            # else in the file (e.g. a recorded stream length) is adjusted
            # if the recompressed size differs.
            pdf_copy = pdf_copy.replace(s, zlib.compress(text))

    with open(output_filename, 'wb') as out:
        out.write(pdf_copy)
    
    0 讨论(0)
提交回复
热议问题