Detect URLs in text with JavaScript

后端 未结 13 2004
孤城傲影
孤城傲影 2020-11-22 06:23

Does anyone have suggestions for detecting URLs in a set of strings?

arrayOfStrings.forEach(function(string){
  // detect URLs in strings and do something sw         


        
相关标签:
13条回答
  • 2020-11-22 06:58

    If you want to detect links with http:// OR without http:// OR ftp OR other possible cases like removing trailing punctuation at the end, take a look at this code.

    https://jsfiddle.net/AndrewKang/xtfjn8g3/

    A simple way to use that is to use NPM

    npm install --save url-knife
    
    0 讨论(0)
  • 2020-11-22 06:59

    tmp.innerText is undefined. You should use tmp.innerHTML

    function strip(html) 
        {  
            var tmp = document.createElement("DIV"); 
            tmp.innerHTML = html; 
            var urlRegex =/(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/ig;   
            return tmp.innerHTML .replace(urlRegex, function(url) {     
            return '\n' + url 
        })
    
    0 讨论(0)
  • 2020-11-22 07:00

    Function can be further improved to render images as well:

    /**
     * Turns text into HTML: the text is first passed through strip(), then
     * every URL is replaced by markup followed by a <br/>.
     * URLs containing ".jpg", ".png" or ".gif" become <img> tags; all other
     * URLs become <a> links whose label is the URL itself.
     *
     * @param text Input text, possibly containing HTML.
     * @returns The processed text with URLs replaced by markup.
     */
    function renderHTML(text) {
        var imageMarkers = [".jpg", ".png", ".gif"];

        // Builds the replacement markup for one matched URL.
        function toMarkup(url) {
            var looksLikeImage = imageMarkers.some(function(marker) {
                return url.indexOf(marker) > 0;
            });
            if (looksLikeImage) {
                return '<img src="' + url + '">' + '<br/>';
            }
            return '<a href="' + url + '">' + url + '</a>' + '<br/>';
        }

        var rawText = strip(text);
        var urlRegex =/(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/ig;
        return rawText.replace(urlRegex, toMarkup);
    }
    

    Or, for a thumbnail image that links to the full-size image:

    return '<a href="' + url + '"><img style="width: 100px; border: 0px; -moz-border-radius: 5px; border-radius: 5px;" src="' + url + '">' + '</a>' + '<br/>'
    

    And here is the strip() function that pre-processes the text string for uniformity by removing any existing html.

    function strip(html) 
        {  
            var tmp = document.createElement("DIV"); 
            tmp.innerHTML = html; 
            var urlRegex =/(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/ig;   
            return tmp.innerText.replace(urlRegex, function(url) {     
            return '\n' + url 
        })
    } 
    
    0 讨论(0)
  • 2020-11-22 07:05

    I googled this problem for quite a while, then it occurred to me that there is an Android method, android.text.util.Linkify, that utilizes some pretty robust regexes to accomplish this. Luckily, Android is open source.

    They use a few different patterns for matching different types of urls. You can find them all here: http://grepcode.com/file/repository.grepcode.com/java/ext/com.google.android/android/2.0_r1/android/text/util/Regex.java#Regex.0WEB_URL_PATTERN

    If you're only concerned with URLs that match the WEB_URL_PATTERN, that is, URLs that conform to the RFC 1738 spec, you can use this:

    /((?:(http|https|Http|Https|rtsp|Rtsp):\/\/(?:(?:[a-zA-Z0-9\$\-\_\.\+\!\*\'\(\)\,\;\?\&\=]|(?:\%[a-fA-F0-9]{2})){1,64}(?:\:(?:[a-zA-Z0-9\$\-\_\.\+\!\*\'\(\)\,\;\?\&\=]|(?:\%[a-fA-F0-9]{2})){1,25})?\@)?)?((?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]{0,64}\.)+(?:(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])|(?:biz|b[abdefghijmnorstvwyz])|(?:cat|com|coop|c[acdfghiklmnoruvxyz])|d[ejkmoz]|(?:edu|e[cegrstu])|f[ijkmor]|(?:gov|g[abdefghilmnpqrstuwy])|h[kmnrtu]|(?:info|int|i[delmnoqrst])|(?:jobs|j[emop])|k[eghimnrwyz]|l[abcikrstuvy]|(?:mil|mobi|museum|m[acdghklmnopqrstuvwxyz])|(?:name|net|n[acefgilopruz])|(?:org|om)|(?:pro|p[aefghklmnrstwy])|qa|r[eouw]|s[abcdeghijklmnortuvyz]|(?:tel|travel|t[cdfghjklmnoprtvwz])|u[agkmsyz]|v[aceginu]|w[fs]|y[etu]|z[amw]))|(?:(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[0-9])))(?:\:\d{1,5})?)(\/(?:(?:[a-zA-Z0-9\;\/\?\:\@\&\=\#\~\-\.\+\!\*\'\(\)\,\_])|(?:\%[a-fA-F0-9]{2}))*)?(?:\b|$)/gi;
    

    Here is the full text of the source:

    "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
    + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
    + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
    + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+"   // named host
    + "(?:"   // plus top level domain
    + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
    + "|(?:biz|b[abdefghijmnorstvwyz])"
    + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])"
    + "|d[ejkmoz]"
    + "|(?:edu|e[cegrstu])"
    + "|f[ijkmor]"
    + "|(?:gov|g[abdefghilmnpqrstuwy])"
    + "|h[kmnrtu]"
    + "|(?:info|int|i[delmnoqrst])"
    + "|(?:jobs|j[emop])"
    + "|k[eghimnrwyz]"
    + "|l[abcikrstuvy]"
    + "|(?:mil|mobi|museum|m[acdghklmnopqrstuvwxyz])"
    + "|(?:name|net|n[acefgilopruz])"
    + "|(?:org|om)"
    + "|(?:pro|p[aefghklmnrstwy])"
    + "|qa"
    + "|r[eouw]"
    + "|s[abcdeghijklmnortuvyz]"
    + "|(?:tel|travel|t[cdfghjklmnoprtvwz])"
    + "|u[agkmsyz]"
    + "|v[aceginu]"
    + "|w[fs]"
    + "|y[etu]"
    + "|z[amw]))"
    + "|(?:(?:25[0-5]|2[0-4]" // or ip address
    + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
    + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
    + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
    + "|[1-9][0-9]|[0-9])))"
    + "(?:\\:\\d{1,5})?)" // plus option port number
    + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~"  // plus option query params
    + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
    + "(?:\\b|$)";
    

    If you want to be really fancy, you can test for email addresses as well. The regex for email addresses is:

    // Email-address pattern as a JavaScript regex literal. Single backslashes are used:
    // the doubled `\\` form is only needed inside a Java string, and in a JS literal
    // `\\@` would demand a literal backslash before the "@".
    /[a-zA-Z0-9\+\.\_\%\-]{1,256}\@[a-zA-Z0-9][a-zA-Z0-9\-]{0,64}(\.[a-zA-Z0-9][a-zA-Z0-9\-]{0,25})+/gi;
    

    PS: The top level domains supported by above regex are current as of June 2007. For an up to date list you'll need to check https://data.iana.org/TLD/tlds-alpha-by-domain.txt.

    0 讨论(0)
  • 2020-11-22 07:07

    First you need a good regex that matches urls. This is hard to do. See here, here and here:

    ...almost anything is a valid URL. There are some punctuation rules for splitting it up. Absent any punctuation, you still have a valid URL.

    Check the RFC carefully and see if you can construct an "invalid" URL. The rules are very flexible.

    For example ::::: is a valid URL. The path is ":::::". A pretty stupid filename, but a valid filename.

    Also, ///// is a valid URL. The netloc ("hostname") is "". The path is "///". Again, stupid. Also valid. This URL normalizes to "///" which is the equivalent.

    Something like "bad://///worse/////" is perfectly valid. Dumb but valid.

    Anyway, this answer is not meant to give you the best regex but rather a proof of how to do the string wrapping inside the text, with JavaScript.

    OK, so let's just use this one: /(https?:\/\/[^\s]+)/g

    Again, this is a bad regex. It will have many false positives. However it's good enough for this example.

    /**
     * Wraps every http:// or https:// URL found in `text` in an anchor tag
     * whose href and label are both the URL. A URL is any run of
     * non-whitespace characters following the scheme, so trailing
     * punctuation is included in the link.
     *
     * @param text The text to scan.
     * @returns The text with each URL replaced by `<a href="URL">URL</a>`.
     */
    function urlify(text) {
      const linkPattern = /(https?:\/\/[^\s]+)/g;
      const toAnchor = (link) => '<a href="' + link + '">' + link + '</a>';
      return text.replace(linkPattern, toAnchor);
    }
    
    // Demo: run a sample sentence containing two URLs through urlify() and print the result.
    var text = 'Find me at http://www.example.com and also at http://stackoverflow.com';
    var html = urlify(text);
    
    console.log(html)

    // html now looks like:
    // "Find me at <a href="http://www.example.com">http://www.example.com</a> and also at <a href="http://stackoverflow.com">http://stackoverflow.com</a>"
    

    So in sum try:

    // For every element selected by '#pad dl dd', replace its markup with the urlify()'d version.
    // NOTE(review): `$$` is not defined in this snippet — presumably a selector helper
    // from a library such as Prototype.js; confirm before use.
    $$('#pad dl dd').each(function(element) {
        // Reads the element's current HTML, links the URLs in it, and writes it back.
        element.innerHTML = urlify(element.innerHTML);
    });
    
    0 讨论(0)
  • 2020-11-22 07:07

    Generic Object Oriented Solution

    For people like me that use frameworks like angular that don't allow manipulating DOM directly, I created a function that takes a string and returns an array of url/plainText objects that can be used to create any UI representation that you want.

    URL regex

    For URL matching I used (slightly adapted) h0mayun regex: /(?:(?:https?:\/\/)|(?:www\.))[^\s]+/g

    My function also drops punctuation characters such as . and , from the end of a URL, since I believe they will more often be actual punctuation than a legitimate URL ending (but they could be! This is not rigorous science, as other answers explain well). For that, I apply the following regex to each matched URL: /^(.+?)([.,?!'"]*)$/.

    Typescript code

        /**
         * Splits a string into an ordered list of plain-text and URL segments.
         *
         * A URL is any run of non-whitespace characters starting with `http://`,
         * `https://` or `www.`. Trailing `.`, `,`, `?`, `!`, `'` and `"` characters
         * are split off the URL and emitted as a separate text segment, since
         * they are more likely sentence punctuation than part of the link.
         * Empty segments are never emitted, and adjacent text segments are not
         * merged.
         *
         * @param inputString The text to scan.
         * @returns The segments in input order; an empty array for empty input.
         */
        export function urlMatcherInText(inputString: string): UrlMatcherResult[] {
            if (!inputString) return [];

            const results: UrlMatcherResult[] = [];

            // Appends one segment of the given kind, skipping empty values.
            const addSegment = (type: UrlMatcherResult['type'], value: string | undefined): void => {
                if (!value) return;

                const result = new UrlMatcherResult();
                result.type = type;
                result.value = value;
                results.push(result);
            };

            const findUrlRegex = /(?:(?:https?:\/\/)|(?:www\.))[^\s]+/g;
            const cleanUrlRegex = /^(.+?)([.,?!'"]*)$/;

            // Index of the first character not yet emitted as a segment.
            let cursor = 0;
            // exec() returns null once no further match exists, which ends the loop.
            let match: RegExpExecArray | null;

            while ((match = findUrlRegex.exec(inputString)) !== null) {
                const dirtyUrl = match[0];

                // Plain text between the previous match and this one.
                addSegment('text', inputString.slice(cursor, match.index));

                const cleaned = cleanUrlRegex.exec(dirtyUrl);
                if (cleaned) {
                    addSegment('url', cleaned[1]);
                    addSegment('text', cleaned[2]);
                } else {
                    // Defensive fallback: keep the raw match rather than dereference null.
                    addSegment('url', dirtyUrl);
                }

                cursor = match.index + dirtyUrl.length;
            }

            // Whatever follows the last URL.
            addSegment('text', inputString.slice(cursor));

            return results;
        }
    
        /**
         * One segment produced by urlMatcherInText: either a URL or a piece of
         * plain text. Fields have defaults so an instance is fully initialized
         * as soon as it is constructed.
         */
        export class UrlMatcherResult {
            /** Whether `value` holds a URL or plain text. */
            public type: 'url' | 'text' = 'text';
            /** The segment's text content. */
            public value = '';
        }
    
    0 讨论(0)
提交回复
热议问题