Using itextsharp (or any c# pdf library), how to open a PDF, replace some text, and save it again?

后端 未结 3 418
孤城傲影
孤城傲影 2021-02-04 16:09

Using iTextSharp (or any C# PDF library), I need to open a PDF, replace some placeholder text with actual values, and return it as a byte[].

Can someone suggest how to do this?

3条回答
  •  粉色の甜心
    2021-02-04 16:17

    Unfortunately, I was looking for something similar and could not figure it out. Below is about as far as I got; maybe you can use it as a starting point. The problem is that a PDF does not actually store text as plain strings, but instead uses lookup tables and some other arcane wizardry. This method reads the byte values of the page content and attempts to convert them to a string, but as far as I can tell it only handles English and misses some special characters, so I gave up on my project and moved on.

    // Copies every page of an existing PDF into a new document, giving the caller a
    // hook to substitute its own page whenever the extracted text contains a marker.
    // string.Empty is a field, not a method, so it must not be invoked.
    string contents = string.Empty;
    PdfReader reader = new PdfReader("pathToPdf.pdf");
    try
    {
        using (MemoryStream memoryStream = new MemoryStream())
        {
            Document doc = new Document();
            PdfWriter writer = PdfWriter.GetInstance(doc, memoryStream);
            doc.Open();
            PdfContentByte cb = writer.DirectContent;

            for (int p = 1; p <= reader.NumberOfPages; p++)
            {
                // Start a new output page with the same size as the source page.
                doc.SetPageSize(reader.GetPageSize(p));
                doc.NewPage();

                // Best-effort text extraction from the raw page content stream.
                byte[] bt = reader.GetPageContent(p);
                contents = ExtractTextFromPDFBytes(bt);

                if (contents.IndexOf("something") != -1)
                {
                    // make your own pdf page and add to cb (contentbyte)

                }
                else
                {
                    // No marker on this page: copy it over unchanged.
                    PdfImportedPage page = writer.GetImportedPage(reader, p);
                    int rot = reader.GetPageRotation(p);
                    if (rot == 90 || rot == 270)
                    {
                        // Rotated source page: apply a rotation matrix and shift by the rotated height.
                        cb.AddTemplate(page, 0, -1.0F, 1.0F, 0, 0, reader.GetPageSizeWithRotation(p).Height);
                    }
                    else
                    {
                        // Identity matrix: place the page as-is.
                        cb.AddTemplate(page, 1.0F, 0, 0, 1.0F, 0, 0);
                    }
                }
            }

            // Close the document first so everything is flushed into memoryStream
            // while the reader is still open.
            doc.Close();

            // MemoryStream.ToArray() still works after the stream has been closed;
            // this byte[] is also what you would return instead of writing a file.
            File.WriteAllBytes("pathToOutputOrSamePathToOverwrite.pdf", memoryStream.ToArray());
        }
    }
    finally
    {
        // Release the source PDF even when copying fails part-way through.
        reader.Close();
    }
    

    This is taken from this site.

    /// <summary>
    /// Best-effort extraction of text from the raw bytes of a PDF page content stream.
    /// </summary>
    /// <remarks>
    /// Each byte is treated as one character. Text is collected only while inside a
    /// text object (between the BT and ET operators) and only from within a
    /// parenthesised string. The operators TD/Td append "\n\r", the operators
    /// ', T* and " append "\n", and Tj appends a single space. Only characters in the
    /// ranges ' '..'~' and 128..254 are kept; a backslash marks the following
    /// character as literal.
    /// </remarks>
    /// <param name="input">Bytes of the page content stream; may be null or empty.</param>
    /// <returns>
    /// The extracted text, or an empty string when <paramref name="input"/> is null or
    /// empty, or when any exception is raised while scanning.
    /// </returns>
    private string ExtractTextFromPDFBytes(byte[] input)
    {
        if (input == null || input.Length == 0)
        {
            return "";
        }

        try
        {
            // A StringBuilder avoids re-copying the accumulated text on every append,
            // which repeated string concatenation inside the per-byte loop would do.
            System.Text.StringBuilder text = new System.Text.StringBuilder();

            // True while the scanner is between a BT and an ET operator.
            bool inTextObject = false;

            // True when the previous character was an unescaped backslash, so the
            // current character must be taken literally (e.g. "\(" or "\\").
            bool nextLiteral = false;

            // 1 while inside a (...) string, 0 otherwise.
            int bracketDepth = 0;

            // Operator token sets are constant, so build them once instead of per byte.
            string[] positionTokens = { "TD", "Td" };
            string[] newLineTokens = { "'", "T*", "\"" };
            string[] showTokens = { "Tj" };
            string[] endTextTokens = { "ET" };
            string[] beginTextTokens = { "BT" };

            // Sliding window of the most recent characters, used to recognise operators.
            char[] previousCharacters = new char[_numberOfCharsToKeep];
            for (int j = 0; j < _numberOfCharsToKeep; j++)
            {
                previousCharacters[j] = ' ';
            }

            for (int i = 0; i < input.Length; i++)
            {
                char c = (char)input[i];

                if (inTextObject)
                {
                    // Outside a string: translate positioning/showing operators into whitespace.
                    if (bracketDepth == 0)
                    {
                        if (CheckToken(positionTokens, previousCharacters))
                        {
                            text.Append("\n\r");
                        }
                        else if (CheckToken(newLineTokens, previousCharacters))
                        {
                            text.Append("\n");
                        }
                        else if (CheckToken(showTokens, previousCharacters))
                        {
                            text.Append(" ");
                        }
                    }

                    if (bracketDepth == 0 && CheckToken(endTextTokens, previousCharacters))
                    {
                        // End of the text object.
                        inTextObject = false;
                        text.Append(" ");
                    }
                    else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                    {
                        // Start of a string: begin outputting text.
                        bracketDepth = 1;
                    }
                    else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                    {
                        // End of the string: stop outputting text.
                        bracketDepth = 0;
                    }
                    else if (bracketDepth == 1)
                    {
                        if (c == '\\' && !nextLiteral)
                        {
                            // Escape introducer: emit nothing, take the next character as-is.
                            nextLiteral = true;
                        }
                        else
                        {
                            if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                            {
                                text.Append(c);
                            }

                            nextLiteral = false;
                        }
                    }
                }

                // Slide the window left by one and record the current character.
                for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
                {
                    previousCharacters[j] = previousCharacters[j + 1];
                }
                previousCharacters[_numberOfCharsToKeep - 1] = c;

                // Start of a text object (checked after the window includes c).
                if (!inTextObject && CheckToken(beginTextTokens, previousCharacters))
                {
                    inTextObject = true;
                }
            }

            return text.ToString();
        }
        catch
        {
            // Deliberately best-effort: any failure while scanning yields an empty result.
            return "";
        }
    }
    
     /// <summary>
     /// Tests whether the window of recent characters ends with one of the given
     /// tokens, with a delimiter (space, CR or LF) immediately before the token and
     /// as the last character of the window.
     /// </summary>
     /// <remarks>
     /// Tokens of any length are supported. Indexing <c>token[1]</c> unconditionally
     /// would throw for one-character tokens such as <c>'</c> and <c>"</c>.
     /// Positions are derived from <c>recent.Length</c>, so a window too short to
     /// hold "delimiter + token + delimiter" simply does not match.
     /// </remarks>
     /// <param name="tokens">Candidate tokens; null or empty entries are ignored.</param>
     /// <param name="recent">Most recent characters, oldest first, newest last.</param>
     /// <returns>True when any token matches at the end of the window; otherwise false.</returns>
     private bool CheckToken(string[] tokens, char[] recent)
     {
         if (tokens == null || recent == null || recent.Length == 0)
         {
             return false;
         }

         // The newest character must be the delimiter that terminates the token.
         int last = recent.Length - 1;
         if (!IsTokenDelimiter(recent[last]))
         {
             return false;
         }

         foreach (string token in tokens)
         {
             if (string.IsNullOrEmpty(token))
             {
                 continue;
             }

             // Index of the token's first character; one more slot is needed before it
             // for the leading delimiter.
             int start = last - token.Length;
             if (start < 1 || !IsTokenDelimiter(recent[start - 1]))
             {
                 continue;
             }

             bool matches = true;
             for (int k = 0; k < token.Length; k++)
             {
                 if (recent[start + k] != token[k])
                 {
                     matches = false;
                     break;
                 }
             }

             if (matches)
             {
                 return true;
             }
         }

         return false;
     }

     /// <summary>Returns true for the characters accepted around a token: space, CR, LF.</summary>
     private static bool IsTokenDelimiter(char c)
     {
         return c == ' ' || c == (char)0x0d || c == (char)0x0a;
     }
    

提交回复
热议问题