Get All Links in a Document

后端 未结 7 1292
余生分开走
余生分开走 2020-12-01 08:33

Given a "normal document" in Google Docs/Drive (e.g. paragraphs, lists, tables) which contains external links scattered throughout the content, how do you compile a list of the links it contains? […]

相关标签:
7条回答
  • 2020-12-01 08:56

    I offer another, shorter answer for your first question, concerning iterating through all links in a document's body. This instructive code returns a flat array of links in the current document, where each link is represented by an object with entries pointing to the text element (textEl), the paragraph element or list item element in which it's contained (paragraph), the offset index in the text where the link appears (startOffset) and the URL itself (url). Hopefully, you'll find it easy to adapt it to your own needs.

    It uses the getTextAttributeIndices() method rather than iterating over every character of the text, and is thus expected to perform much more quickly than previously written answers.

    EDIT: Since originally posting this answer, I have modified the function a couple of times. It now also (1) includes the endOffsetInclusive property for each link (note that it can be null for links that extend to the end of the text element - in this case one can use link.textEl.getText().length-1 instead); (2) finds links in all sections of the document, not only the body; (3) includes the section and isFirstPageSection properties to indicate where the link is located; and (4) accepts the argument mergeAdjacent, which, when set to true, will return only a single link entry for a continuous stretch of text linked to the same URL (which would otherwise be considered separate if, for instance, part of the text is styled differently than another part).

    For the purpose of including links under all sections, a new utility function, iterateSections(), was introduced.

    /**
     * Collects every hyperlink of the active document into one flat array,
     * visiting each section that iterateSections() reports (not only the body).
     *
     * Every entry is a plain object with these keys:
     *   - "section": the section element handed to the iterateSections()
     *     callback when the link was found.
     *   - "isFirstPageSection": {Boolean} the flag iterateSections() reported
     *     for that section.
     *   - "paragraph": the element (taken from section.getParagraphs()) that
     *     contains the link.
     *   - "textEl": the Text element in which the link is found.
     *   - "startOffset": {Number} offset in the text element where the link
     *     begins.
     *   - "endOffsetInclusive": {Number|null} offset of the last linked
     *     character, or null when the link is the last styling segment of the
     *     text element (i.e. it runs to the end of the text).
     *   - "url": the URL of the link.
     *
     * @param {boolean} mergeAdjacent When truthy, directly consecutive styling
     *     segments that carry the same URL are reported as one entry instead
     *     of one entry per segment.
     *
     * @returns {Array} the flat array of link objects described above.
     * @throws {Error} if a section does not expose getParagraphs().
     */
    function getAllLinks(mergeAdjacent) {
      var found = [];
      var activeDoc = DocumentApp.getActiveDocument();

      iterateSections(activeDoc, function(section, sectionIndex, isFirstPageSection) {
        // Sections come from an undocumented API; fail loudly rather than
        // cryptically if their shape ever changes.
        if (!("getParagraphs" in section)) {
          var reason = "An API change has caused this script to stop " +
                       "working.\n" +
                       "Section #" + sectionIndex + " of type " +
                       section.getType() + " has no .getParagraphs() method. " +
                       "Stopping script.";
          throw new Error(reason);
        }

        var paragraphs = section.getParagraphs();
        for (var p = 0; p < paragraphs.length; p++) {
          var par = paragraphs[p];

          // nothing to scan in an empty paragraph
          if (par.getNumChildren() == 0) {
            continue;
          }

          // walk the sibling chain of the paragraph / list item
          for (var el = par.getChild(0); el != null; el = el.getNextSibling()) {
            if (el.getType() != DocumentApp.ElementType.TEXT) {
              continue;
            }

            // each index marks the start of a run with uniform attributes
            var segmentStarts = el.getTextAttributeIndices();
            var previous = null;

            for (var i = 0; i < segmentStarts.length; i++) {
              var startOffset = segmentStarts[i];
              var url = el.getLinkUrl(startOffset);
              if (url == null) {
                continue;
              }

              // the segment ends right before the next one starts; the last
              // segment has no known end
              var endOffsetInclusive = null;
              if (i + 1 < segmentStarts.length) {
                endOffsetInclusive = segmentStarts[i + 1] - 1;
              }

              var continuesPrevious = mergeAdjacent && previous != null &&
                  previous.url == url &&
                  previous.endOffsetInclusive == startOffset - 1;
              if (continuesPrevious) {
                // same URL, no gap: just stretch the entry already recorded
                previous.endOffsetInclusive = endOffsetInclusive;
                continue;
              }

              previous = {
                "section": section,
                "isFirstPageSection": isFirstPageSection,
                "paragraph": par,
                "textEl": el,
                "startOffset": startOffset,
                "endOffsetInclusive": endOffsetInclusive,
                "url": url
              };
              found.push(previous);
            }
          }
        }
      });

      return found;
    }
    
    /**
     * Calls the given function once for each section of the document (body,
     * header, etc.). Sections are obtained as the children of the body's
     * parent element.
     *
     * @param {Document} doc The Document object (such as the one obtained via
     *     a call to DocumentApp.getActiveDocument()) with the sections to iterate
     *     over.
     * @param {Function} func A callback function which will be called, for each
     *     section, with the following arguments (in order):
     *       - {ContainerElement} section - the section element
     *       - {Number} sectionIndex - the child index of the section, such that
     *         doc.getBody().getParent().getChild(sectionIndex) == section.
     *       - {Boolean} isFirstPageSection - true when the section is a header
     *         or footer section other than the ones returned by
     *         doc.getHeader() / doc.getFooter().
     */
    function iterateSections(doc, func) {
      // get the DocumentElement interface to iterate over all sections
      // this bit is undocumented API
      var docEl = doc.getBody().getParent();

      // fetch each once; either may be null when the document has none
      var header = doc.getHeader();
      var footer = doc.getFooter();

      var regularHeaderSectionIndex = (header == null ? -1 :
                                       docEl.getChildIndex(header));
      var regularFooterSectionIndex = (footer == null ? -1 :
                                       docEl.getChildIndex(footer));

      for (var i = 0; i < docEl.getNumChildren(); ++i) {
        var section = docEl.getChild(i);
        var sectionType = section.getType();

        var isHeaderOrFooter = (
          sectionType == DocumentApp.ElementType.HEADER_SECTION ||
          sectionType == DocumentApp.ElementType.FOOTER_SECTION);

        // any header/footer section that is not the regular one is treated
        // as a first-page section
        var isFirstPageSection = (
          isHeaderOrFooter &&
          i != regularHeaderSectionIndex &&
          i != regularFooterSectionIndex);

        func(section, i, isFirstPageSection);
      }
    }
    
    0 讨论(0)
  • 2020-12-01 08:57

    You are right ... search and replace is not applicable here. Use setLinkUrl() https://developers.google.com/apps-script/reference/document/container-element#setLinkUrl(String)

    Basically, you have to iterate through the elements recursively (elements can contain other elements). For each one, call getLinkUrl() to get the old URL and, if it is not null, call setLinkUrl() with the new URL. This leaves the displayed text unchanged.

    0 讨论(0)
  • 2020-12-01 09:01

    Here's a quick and dirty way to accomplish the same goal with no scripting:

    1. From Google Docs, save the document in RTF format.
    2. In your editor of choice, edit the links in the RTF file (in my case, I wanted to modify all the hyperlinks, so I used Emacs and regexp-replace). Save the file when you're done.
    3. Create a fresh, new Google Doc, and from the menu, select File>Open and open the RTF file. Docs will convert your edited RTF file back into a proper Google Doc, restoring all formatting.

    Google Docs' RTF format is pretty complete--I haven't noticed any loss of fidelity in making the round trip, and it has the advantage of fully exposing all the hyperlinks, formatting, and everything else about the document in a form that's easy to edit and to apply regex tools to.

    0 讨论(0)
  • 2020-12-01 09:08

    I was playing around and incorporated @Mogsdad's answer -- here's the really complicated version:

    // Underscore.js handle; the helpers in this script use _.map, _.each,
    // _.isArray and _.isFunction from it.
    var _ = Underscorejs.load(); // loaded via http://googleappsdeveloper.blogspot.com/2012/11/using-open-source-libraries-in-apps.html, rolled my own
    // UI handle obtained from DocumentApp; the helpers in this script use it
    // for prompts, alerts and menu creation.
    var ui = DocumentApp.getUi();
    
    // #region --------------------- Utilities -----------------------------
    
    /**
     * Helper object for finding and rewriting hyperlinks in a document.
     *
     * Exposes:
     *   - updateLink(link, oldText, newText, start, end)
     *   - updateLinksMenu()
     *   - findAllElementsFor(el, test)
     *
     * Relies on the globals `ui`, `log`, `_` and `gDocUiHelper`.
     */
    var gDocsHelper = (function(P, un) {
      // heavily based on answer https://stackoverflow.com/a/18731628/1037948

      /**
       * Returns a function that lazily builds a short preview (at most 100
       * characters, followed by '...' when truncated) of the element's text
       * starting at the given offset. Being a function, it is only evaluated
       * if the logger/sidebar actually asks for it.
       */
      var updatedLinkText = function(link, offset) {
        return function() {
          var from = offset || 0;
          var text = link.getText();
          // substring() takes an END index, not a length
          return 'Text: ' + text.substring(from, from + 100) + ((text.length - from) > 100 ? '...' : '');
        };
      };

      /**
       * Replaces every literal occurrence of oldText inside the URL of the link
       * found at `start` and writes the new URL back over the link's range.
       *
       * @param {Text} link the text element carrying the link
       * @param {String} oldText literal text to look for in the URL
       * @param {String} newText literal replacement
       * @param {Number} start offset of the first linked character
       * @param {Number} end offset of the last linked character (inclusive);
       *     when omitted, the range is extended over the following characters
       *     that carry the same URL; a value below `start` means a
       *     one-character link
       * @returns {Object|Boolean} false when nothing was changed, otherwise
       *     { old, new, getText } describing the change
       */
      P.updateLink = function updateLink(link, oldText, newText, start, end) {
        var oldLink = link.getLinkUrl(start);

        // no link at this position, nothing to search for, or no match
        if(!oldLink || !oldText || 0 > oldLink.indexOf(oldText)) return false;

        // literal replace-all: the match above is literal, so the replacement
        // must not reinterpret oldText/newText as regex syntax
        var newLink = oldLink.split(oldText).join(newText);

        var startOffset = start || 0;
        var endOffset = end;
        if(endOffset == null) {
          // no explicit end: cover the run of characters sharing this URL
          var textLength = link.getText().length;
          endOffset = startOffset;
          while(endOffset + 1 < textLength && link.getLinkUrl(endOffset + 1) === oldLink) {
            endOffset++;
          }
        }
        else if(endOffset < startOffset) {
          // e.g. a -1 "not set" marker for a link that is one character long
          endOffset = startOffset;
        }

        link.setLinkUrl(startOffset, endOffset, newLink);
        log(true, "Updating Link: ", oldLink, newLink, startOffset, endOffset, updatedLinkText(link, startOffset) );

        return { old: oldLink, "new": newLink, getText: updatedLinkText(link, startOffset) };
      };

      // moving this reused block out to 'private' fn:
      // updates one found link, reports the change in the sidebar and records
      // the link's original URL in `urls`
      var updateLinkResult = function(text, oldText, newText, link, urls, sidebar) {
        var updateResult = P.updateLink(text, oldText, newText, link.start, link.end);
        if(false !== updateResult) {
           sidebar.append('<li>' + updateResult['old'] + ' &rarr; ' + updateResult['new'] + ' at ' + updateResult['getText']() + '</li>');
        }

        urls.push(link.url); // so multiple links get added to list
      };

      /**
       * Menu handler: prompts for the old/new link text, rewrites every
       * matching link URL in the document body and shows a sidebar report.
       */
      P.updateLinksMenu = function() {
        // https://developers.google.com/apps-script/reference/base/prompt-response
        var oldText = ui.prompt('Old link text to replace').getResponseText();
        var newText = ui.prompt('New link text to replace with').getResponseText();

        log('Replacing: ' + oldText + ', ' + newText);
        var sidebar = gDocUiHelper.createSidebar('Update All Links', '<h3>Replacing</h3><p><code>' + oldText + '</code> &rarr; <code>' + newText + '</code></p><hr /><ol>');

        // current doc available to script
        var doc = DocumentApp.getActiveDocument().getBody();//.getActiveSection();

        var links = P.findAllElementsFor(doc, function(text) {
          var i = -1, n = text.getText().length, link = false, url, urls = [];

          // scan the text element character by character for links
          while(++i < n) {
            url = text.getLinkUrl(i);

            // just left the tracked link -- either into plain text or straight
            // into a link with a different URL
            if(false !== link && url !== link.url) {
              updateLinkResult(text, oldText, newText, link, urls, sidebar);
              link = false; // reset link tracking
            }

            // getLinkUrl keeps returning the URL while INSIDE the link, so
            // open the link once and then only push its end forward
            if(url) {
              if(false === link) {
                link = { start: i, end: i, url: url };
              }
              else {
                link.end = i; // keep updating the end position until we leave
              }
            }
          }

          // once we've reached the end of the text, must also check to see if the last thing we found was a link
          if(false !== link) updateLinkResult(text, oldText, newText, link, urls, sidebar);

          return urls;
        });

        sidebar.append('</ol><p><strong>' + links.length + ' links reviewed</strong></p>');
        gDocUiHelper.attachSidebar(sidebar);

        log(links);
      };

      /**
       * Runs `test` on every Text element found below `el` and collects the
       * truthy results (array results are flattened into the output).
       *
       * findElement() is given the previous match as its starting point and
       * walks on from there, so no separate recursion over the children is
       * performed.
       *
       * @param {ContainerElement} el element to search in
       * @param {Function} test callback receiving each Text element
       * @returns {Array} collected results
       */
      P.findAllElementsFor = function(el, test) {
        // generic utility function to find all text elements; heavily based on https://stackoverflow.com/a/18731628/1037948

        var results = [], searchResult = null, result;
        // https://developers.google.com/apps-script/reference/document/body#findElement(ElementType)
        while (searchResult = el.findElement(DocumentApp.ElementType.TEXT, searchResult)) {
          var t = searchResult.getElement().editAsText(); // .asParagraph()

          // check to add to list
          if(test && (result = test(t))) {
            if( _.isArray(result) ) results = results.concat(result); // could be big? http://jsperf.com/self-concatenation/
            else results.push(result);
          }
        }

        return results;
      };

      return P;
    })({});
    
    /**
     * Global wrapper around gDocsHelper.updateLinksMenu().
     *
     * The menu entry registered in this script refers to its handler by the
     * string 'gDocsUpdateLinksMenu', hence this free-standing function.
     * Original author's remark: really? it can't handle object properties?
     */
    function gDocsUpdateLinksMenu() {
      gDocsHelper.updateLinksMenu();
    }
    
    // Register a 'Zaus' menu with one 'Update links' item; the second entry of
    // the pair is the name of the global handler function.
    gDocUiHelper.addMenu('Zaus', [ ['Update links', 'gDocsUpdateLinksMenu'] ]);
    
    // #endregion --------------------- Utilities -----------------------------
    

    And I'm including the "extra" utility classes for creating menus, sidebars, etc below for completeness:

    /**
     * Logging helper on top of Logger.log().
     *
     * Any argument that is a function is called (without arguments) and its
     * return value is logged instead, so expensive messages can be passed as
     * delegates.
     *
     * If the first argument is exactly `true`, all arguments (including that
     * `true`) are joined with '; ' and written as one log line; otherwise
     * every argument is written as its own log line.
     */
    var log = function() {
      // unwrap delegates: functions are invoked, everything else passes through
      var unwrap = function(entry) {
        return _.isFunction(entry) ? entry() : entry;
      };

      var entries = Array.prototype.slice.call(arguments);

      if(entries[0] !== true) {
        _.each(entries, function(entry) {
          Logger.log(unwrap(entry));
        });
        return;
      }

      var pieces = _.map(entries, function(entry) { return unwrap(entry); });
      Logger.log(pieces.join('; '));
    }
    
    // #region --------------------- Menu -----------------------------
    
    /**
     * Helper object for building menus and sidebars.
     *
     * Exposes:
     *   - addMenuToSheet(spreadsheet, title, items)
     *   - addMenu(menuTitle, menuItems)
     *   - createSidebar(title, content, options)
     *   - attachSidebar(sidebar)
     *   - onOpen()
     *
     * Relies on the globals `ui`, `log` and `_`.
     */
    var gDocUiHelper = (function(P, un) {

      /**
       * Creates a menu called `title` and adds it to the UI.
       *
       * @param {*} spreadsheet not used by this function; kept so existing
       *     callers keep working
       * @param {String} title caption of the menu
       * @param {Array|Object} items entries, each either a [name, functionName]
       *     pair or an object { name, functionName }; malformed entries are
       *     logged and reported through ui.alert() and then skipped
       */
      P.addMenuToSheet = function addMenu(spreadsheet, title, items) {
        var menu = ui.createMenu(title);
        // make sure menu items are correct format
        _.each(items, function(v,k) {
          var err = [];

          // provided in format [ [name, fn],... ] instead
          if( _.isArray(v) ) {
            if ( v.length === 2 ) {
              menu.addItem(v[0], v[1]);
            }
            else {
              err.push('Menu item ' + k + ' missing name or function: ' + v.join(';'));
            }
          }
          else {
            if( !v.name ) err.push('Menu item ' + k + ' lacks name');
            if( !v.functionName ) err.push('Menu item ' + k + ' lacks function');

            if(!err.length) menu.addItem(v.name, v.functionName);
          }

          if(err.length) {
            log(err);
            ui.alert(err.join('; '));
          }

        });

        menu.addToUi();
      };

      // list of things to hook into: menu title -> accumulated menu items
      var initializers = {};

      /**
       * Queues menu items under the given menu title; the menus are actually
       * built when P.onOpen() runs. Repeated calls with the same title append.
       */
      P.addMenu = function(menuTitle, menuItems) {
        if(initializers[menuTitle] === un) {
          initializers[menuTitle] = [];
        }
        initializers[menuTitle] = initializers[menuTitle].concat(menuItems);
      };

      /**
       * Creates an HtmlOutput to be used as a sidebar.
       *
       * @param {String} title sidebar title
       * @param {String} content initial HTML content
       * @param {Object} [options] optional settings: `width` (pixels, default
       *     350) and `on` (truthy to show the sidebar immediately)
       * @returns {HtmlOutput} the sidebar, so more content can be appended
       */
      P.createSidebar = function(title, content, options) {
        var sidebar = HtmlService
        .createHtmlOutput()
        .setTitle(title)
        // read the width from the options object (a bare `width` is undefined)
        .setWidth( (options && options.width) ? options.width : 350 /* pixels */);

        sidebar.append(content);

        if(options && options.on) DocumentApp.getUi().showSidebar(sidebar);
        // else { sidebar.attach = function() { DocumentApp.getUi().showSidebar(this); }; } // should really attach to prototype...

        return sidebar;
      };

      /** Shows a sidebar previously built with createSidebar(). */
      P.attachSidebar = function(sidebar) {
        DocumentApp.getUi().showSidebar(sidebar);
      };


      /** Builds every menu that was queued through addMenu(). */
      P.onOpen = function() {
        log(initializers);
        _.each(initializers, function(v,k) {
          // addMenuToSheet() never reads its first argument, so there is no
          // need to look up a spreadsheet here
          P.addMenuToSheet(null, k, v);
        });
      };

      return P;
    })({});
    
    // #endregion --------------------- Menu -----------------------------
    
    /**
     * A special function intended to run when the host file is opened. It
     * delegates to gDocUiHelper.onOpen(), which builds the custom menus that
     * were queued through gDocUiHelper.addMenu().
     */
    function onOpen() {
      gDocUiHelper.onOpen();
    }
    
    0 讨论(0)
  • 2020-12-01 09:16

    I had some trouble getting Mogsdad's solution to work. Specifically, it misses links which end their parent element, because there isn't a trailing non-link character to terminate them. I've implemented something which addresses this and returns a standard range element. Sharing it here in case someone finds it useful.

    /**
     * Builds a Range covering every linked stretch of text at or below the
     * given element.
     *
     * For a Text element, each maximal run of consecutive characters sharing
     * the same non-null link URL becomes one range entry (start and end offsets
     * inclusive). For any other element exposing getNumChildren, the ranges of
     * all children are gathered recursively. Anything else yields an empty
     * range.
     *
     * @param {Element} element the element to scan
     * @returns {Range} the range built from all linked runs found
     */
    function getAllLinks(element) {
      var builder = DocumentApp.getActiveDocument().newRange();
      var isText = element.getType() === DocumentApp.ElementType.TEXT;

      if (!isText) {
        // containers: merge in whatever each child yields
        if (element.getNumChildren) {
          for (var childIdx = 0; childIdx < element.getNumChildren(); childIdx++) {
            builder.addRange(getAllLinks(element.getChild(childIdx)));
          }
        }
        return builder.build();
      }

      var content = element.getText();
      var runs = [];       // [start, endInclusive] pairs, in document order
      var runStart = -1;   // -1 while no linked run is open
      var runUrl = null;   // URL seen at the previous character

      for (var pos = 0; pos < content.length; pos++) {
        var urlHere = element.getLinkUrl(pos);

        // the open run stops as soon as the URL changes (to null or to another URL)
        if (runStart !== -1 && urlHere !== runUrl) {
          runs.push([runStart, pos - 1]);
          runStart = -1;
        }

        // a linked character with no open run starts a new one
        if (urlHere !== null && runStart === -1) {
          runStart = pos;
        }

        runUrl = urlHere;
      }

      // a run that reaches the last character is closed here
      if (runStart !== -1) {
        runs.push([runStart, content.length - 1]);
      }

      for (var r = 0; r < runs.length; r++) {
        builder.addElement(element, runs[r][0], runs[r][1]);
      }

      return builder.build();
    }
    
    0 讨论(0)
  • 2020-12-01 09:18

    This is only mostly painful! Code is available as part of a gist.

    [Screenshot] Yeah, I can't spell.

    getAllLinks

    Here's a utility function that scans the document for all LinkUrls, returning them in an array.

    /**
     * Get an array of all LinkUrls in the document. The function is
     * recursive, and if no element is provided, it will default to
     * the active document's Body element.
     *
     * Within a Text element, every maximal run of consecutive characters that
     * share the same link URL is reported as one link. This includes links
     * that run up to the last character of the element, links that are a
     * single character long, and links that directly follow another link with
     * a different URL. Elements that are neither Text nor containers (they have
     * no getNumChildren method) contribute no links.
     *
     * @param {Element} element The document element to operate on.
     *
     * @returns {Array}         Array of objects, viz.
     *                              {element,
     *                               startOffset,
     *                               endOffsetInclusive,
     *                               url}
     */
    function getAllLinks(element) {
      var links = [];
      element = element || DocumentApp.getActiveDocument().getBody();

      if (element.getType() === DocumentApp.ElementType.TEXT) {
        var textObj = element.editAsText();
        var text = element.getText();
        var curUrl = null; // the link currently being tracked, if any
        for (var ch=0; ch < text.length; ch++) {
          var url = textObj.getLinkUrl(ch);

          // Leaving the tracked link: either into unlinked text or straight
          // into a link with a different URL.
          if (curUrl !== null && url !== curUrl.url) {
            links.push(curUrl);  // add to links
            curUrl = null;
          }

          if (url != null) {
            if (curUrl === null) {
              // Entering a link; a one-character link starts and ends here.
              curUrl = {};
              curUrl.element = element;
              curUrl.url = String( url ); // grab a copy
              curUrl.startOffset = ch;
              curUrl.endOffsetInclusive = ch;
            }
            else {
              curUrl.endOffsetInclusive = ch;
            }
          }
        }

        // The text may end while still inside a link.
        if (curUrl !== null) {
          links.push(curUrl);
        }
      }
      else if (typeof element.getNumChildren === 'function') {
        // Only containers have children; other elements are skipped.
        var numChildren = element.getNumChildren();
        for (var i=0; i<numChildren; i++) {
          links = links.concat(getAllLinks(element.getChild(i)));
        }
      }

      return links;
    }
    

    findAndReplaceLinks

    This utility builds on getAllLinks to do a find & replace function.

    /**
     * Replace all or part of UrlLinks in the document.
     *
     * The links are obtained from getAllLinks(). Every link whose URL matches
     * the pattern has the match replaced and the new URL written back over the
     * link's range.
     *
     * @param {String|RegExp} searchPattern  the regex pattern to search for; a
     *                                       string is compiled into a RegExp so
     *                                       that matching and replacing use the
     *                                       same semantics
     * @param {String} replacement      the text to use as replacement
     *
     * @returns {Number}                number of Urls changed
     */
    function findAndReplaceLinks(searchPattern,replacement) {
      // A string passed to match() is treated as a regex, but a string passed
      // to replace() is treated literally; compile once so both agree.
      var pattern = (searchPattern instanceof RegExp) ? searchPattern
                                                      : new RegExp(searchPattern);
      var links = getAllLinks();
      var numChanged = 0;

      for (var l=0; l<links.length; l++) {
        var link = links[l];

        // search() is independent of the regex's global flag / lastIndex
        if (link.url.search(pattern) === -1) {
          continue;
        }

        // This link needs to be changed
        var newUrl = link.url.replace(pattern,replacement);

        // A link without a recorded end is one character long.
        var endOffsetInclusive = (link.endOffsetInclusive == null)
            ? link.startOffset
            : link.endOffsetInclusive;

        link.element.setLinkUrl(link.startOffset, endOffsetInclusive, newUrl);
        numChanged++;
      }
      return numChanged;
    }
    

    Demo UI

    To demonstrate the use of these utilities, here are a couple of UI extensions:

    /**
     * Adds a 'Utils' menu with two items to the document UI:
     * 'List Links' (handler name 'sidebarLinks') and 'Replace Link Text'
     * (handler name 'searchReplaceLinks').
     */
    function onOpen() {
      var utilsMenu = DocumentApp.getUi().createMenu('Utils');
      utilsMenu.addItem('List Links', 'sidebarLinks');
      utilsMenu.addItem('Replace Link Text', 'searchReplaceLinks');
      utilsMenu.addToUi();
    }
    
    /**
     * Shows a small dialog with a "Find" and a "Replace" text box plus a
     * Submit button. The text boxes are named "searchPattern" and
     * "replacement"; the button's click handler is the server function
     * 'myClickHandler'.
     */
    function searchReplaceLinks() {
      var ui = DocumentApp.getUi();
      var app = UiApp.createApplication()
                     .setWidth(250)
                     .setHeight(100)
                     .setTitle('Change Url text');
      var form = app.createFormPanel();
      var flow = app.createFlowPanel();

      // one label + named text box per form field, in display order
      var fields = [
        { caption: "Find: ", name: "searchPattern" },
        { caption: "Replace: ", name: "replacement" }
      ];
      for (var f = 0; f < fields.length; f++) {
        flow.add(app.createLabel(fields[f].caption));
        flow.add(app.createTextBox().setName(fields[f].name));
      }

      var handler = app.createServerHandler('myClickHandler');
      var submitButton = app.createSubmitButton("Submit").addClickHandler(handler);
      flow.add(submitButton);

      form.add(flow);
      app.add(form);
      ui.showDialog(app);
    }
    
    /**
     * ClickHandler to close dialog.
     *
     * Fetches the active UiApp application, closes it and returns it.
     *
     * @param {Object} e the event object passed to the handler (not used here)
     * @returns the closed application instance
     */
    function myClickHandler(e) {
      var app = UiApp.getActiveApplication();

      app.close();
      return app;
    }
    
    /**
     * Runs the link replacement with the submitted form values, refreshes the
     * sidebar list of links and reports how many URLs were changed.
     *
     * @param {Object} e event object; e.parameter.searchPattern and
     *     e.parameter.replacement are read from it (presumably the values of
     *     the text boxes of the dialog form - confirm against the caller)
     */
    function doPost(e) {
      var numChanged = findAndReplaceLinks(e.parameter.searchPattern,e.parameter.replacement);
      var ui = DocumentApp.getUi();

      sidebarLinks(); // Update list

      ui.alert(
          'Results',
          "Changed "+numChanged+" urls.",
          ui.ButtonSet.OK);
    }
    
    
    /**
     * Shows a custom HTML user interface in a sidebar in the Google Docs editor,
     * listing the URL of every link returned by getAllLinks(), one paragraph
     * per link.
     */
    function sidebarLinks() {
      var links = getAllLinks();
      var sidebar = HtmlService
              .createHtmlOutput()
              .setTitle('URL Links')
              .setWidth(350 /* pixels */);

      // Display list of links, url only.
      for (var l=0; l<links.length; l++) {
        var link = links[l];
        sidebar.append('<p>');
        // The URL comes from document content, so it is added as escaped
        // text rather than as raw HTML.
        sidebar.appendUntrusted(link.url);
        sidebar.append('</p>');
      }

      DocumentApp.getUi().showSidebar(sidebar);
    }
    
    0 讨论(0)
提交回复
热议问题