Extract hostname from string

后端 未结 28 1497
情歌与酒
情歌与酒 2020-11-22 07:15

I would like to match just the root of a URL and not the whole URL from a text string. Given:

http://www.youtube.co         


        
相关标签:
28条回答
  • 2020-11-22 07:52

    parse-domain - a very solid lightweight library

    npm install parse-domain

    const { fromUrl, parseDomain } = require("parse-domain");
    

    Example 1

    parseDomain(fromUrl("http://www.example.com/12xy45"))
    
    { type: 'LISTED',
      hostname: 'www.example.com',
      labels: [ 'www', 'example', 'com' ],
      icann:
       { subDomains: [ 'www' ],
         domain: 'example',
         topLevelDomains: [ 'com' ] },
      subDomains: [ 'www' ],
      domain: 'example',
      topLevelDomains: [ 'com' ] }
    

    Example 2

    parseDomain(fromUrl("http://subsub.sub.test.ExAmPlE.coM/12xy45"))
    
    { type: 'LISTED',
      hostname: 'subsub.sub.test.example.com',
      labels: [ 'subsub', 'sub', 'test', 'example', 'com' ],
      icann:
       { subDomains: [ 'subsub', 'sub', 'test' ],
         domain: 'example',
         topLevelDomains: [ 'com' ] },
      subDomains: [ 'subsub', 'sub', 'test' ],
      domain: 'example',
      topLevelDomains: [ 'com' ] }
    

    Why?

    Depending on the use case and volume, I strongly recommend against solving this problem yourself using regex or other string-manipulation means. The core of the problem is that you need to know all the gTLD and ccTLD suffixes to properly parse URL strings into domains and subdomains, and these suffixes are regularly updated. This is a solved problem and not one you want to solve yourself (unless you are Google or something). Unless you need the hostname or domain name in a pinch, don't try to parse your way out of this one.

    0 讨论(0)
  • 2020-11-22 07:54

    Try this:

    // Pull the host part out of an http(s) URL held in `url` (defined outside this snippet).
    // The pattern is anchored at the start of the string and the `i` flag makes the
    // "http"/"https" prefix case-insensitive. Capture group 1 is the run of characters
    // after "://" up to (not including) the first "/", "?" or "#", or the end of the
    // string, so a ":port" suffix stays inside the capture.
    var matches = url.match(/^https?\:\/\/([^\/?#]+)(?:[\/?#]|$)/i);
    var domain = matches && matches[1];  // domain will be null if no match is found
    

    If you want to exclude the port from your result, use this expression instead:

    /^https?\:\/\/([^\/:?#]+)(?:[\/:?#]|$)/i
    

    Edit: To prevent specific domains from matching, use a negative lookahead. (?!youtube.com)

    /^https?\:\/\/(?!(?:www\.)?(?:youtube\.com|youtu\.be))([^\/:?#]+)(?:[\/:?#]|$)/i
    
    0 讨论(0)
  • 2020-11-22 07:54

    Parsing a URL can be tricky because you can have port numbers and special characters. As such, I recommend using something like parseUri to do this for you. I doubt performance is going to be an issue unless you are parsing hundreds of URLs.

    0 讨论(0)
  • 2020-11-22 07:54

    Well, using a regular expression will be a lot easier:

        // Split a URL into its host and the remainder with a single regular expression.
        const mainUrl = "http://www.mywebsite.com/mypath/to/folder";
        // Group 1: everything after the optional "scheme://" up to the first "/".
        // Group 2: the rest of the string (path, query, fragment).
        const urlParts = /^(?:\w+\:\/\/)?([^\/]+)(.*)$/.exec(mainUrl);
        // Read the host from the match result; the original referenced an undefined
        // `Fragment` variable here, which throws a ReferenceError.
        const host = urlParts[1]; // www.mywebsite.com
    
    0 讨论(0)
  • 2020-11-22 07:57

    I recommend using the npm package psl (Public Suffix List). The "Public Suffix List" is a list of all valid domain suffixes and rules, not just Country Code Top-Level domains, but unicode characters as well that would be considered the root domain (i.e. www.食狮.公司.cn, b.c.kobe.jp, etc.). Read more about it here.

    Try:

    npm install --save psl
    

    Then with my "extractHostname" implementation run:

    let psl = require('psl');
    let url = 'http://www.youtube.com/watch?v=ClkQA2Lb_iE';
    psl.get(extractHostname(url)); // returns youtube.com
    

    I can't use an npm package in this snippet, so the code below only tests extractHostname.

    /**
     * Extract the hostname from a URL-like string using plain string handling.
     *
     * Accepts full URLs ("https://host/path"), protocol-relative URLs
     * ("//host/path") and scheme-less strings ("host:8080/path?x=1").
     * The scheme, credentials ("user:pass@"), port, path, query and fragment
     * are all removed. Bracketed IPv6 literals are returned with their brackets.
     *
     * @param {string} url - The URL-like string to inspect.
     * @returns {string} The hostname, or an empty string if none is present.
     */
    function extractHostname(url) {
        // Remove a leading "scheme://" or protocol-relative "//". The pattern is
        // anchored at the start so that a "//" appearing later in the path or
        // query string is never mistaken for the scheme separator.
        const withoutScheme = url.trim().replace(/^(?:[a-z][a-z0-9+.-]*:)?\/\//i, '');

        // The authority component ends at the first path, query or fragment delimiter.
        let host = withoutScheme.split(/[\/?#]/)[0];

        // Drop "user:password@" credentials; lastIndexOf yields -1 when there is
        // no "@", which makes slice(0) keep the whole string.
        host = host.slice(host.lastIndexOf('@') + 1);

        // A bracketed IPv6 literal contains colons itself, so cut after "]"
        // instead of at the first ":".
        if (host.startsWith('[')) {
            const closing = host.indexOf(']');
            if (closing > -1) {
                return host.slice(0, closing + 1);
            }
        }

        // Whatever follows the first ":" is the port number.
        return host.split(':')[0];
    }
    
    // Smoke test: print the hostname extracted from each sample, in order.
    console.log("== Testing extractHostname: ==");
    [
        "http://www.blog.classroom.me.uk/index.php",
        "http://www.youtube.com/watch?v=ClkQA2Lb_iE",
        "https://www.youtube.com/watch?v=ClkQA2Lb_iE",
        "www.youtube.com/watch?v=ClkQA2Lb_iE",
        "ftps://ftp.websitename.com/dir/file.txt",
        "websitename.com:1234/dir/file.txt",
        "ftps://websitename.com:1234/dir/file.txt",
        "example.com?param=value",
        "https://facebook.github.io/jest/",
        "//youtube.com/watch?v=ClkQA2Lb_iE",
        "http://localhost:4200/watch?v=ClkQA2Lb_iE",
    ].forEach(function (sample) {
        console.log(extractHostname(sample));
    });
    
    // Warning: you can use this function to extract the "root" domain, but it will not be as accurate as using the psl package.
    
    /**
     * Approximate the registrable ("root") domain of a URL-like string.
     *
     * The hostname is obtained from extractHostname and split on ".". With more
     * than two labels, the last two are kept; if both of those labels are exactly
     * two characters long (e.g. "me.uk"), the label before them is kept as well.
     * This is a length-based guess, not a Public Suffix List lookup.
     *
     * @param {string} url - The URL-like string to inspect.
     * @returns {string} The guessed root domain, or the hostname itself when it
     *     has two labels or fewer.
     */
    function extractRootDomain(url) {
        const hostname = extractHostname(url);
        const labels = hostname.split('.');

        // No subdomain to strip: return the hostname unchanged.
        if (labels.length <= 2) {
            return hostname;
        }

        const [third, second, top] = labels.slice(-3);
        const lastTwo = second + '.' + top;

        // Two 2-character trailing labels are treated as a country-code suffix
        // such as ".me.uk", so one more label is needed to name the domain.
        const looksLikeCcTld = second.length === 2 && top.length === 2;
        return looksLikeCcTld ? third + '.' + lastTwo : lastTwo;
    }
    
    // Smoke test: print the root domain guessed for each sample, in order.
    console.log("== Testing extractRootDomain: ==");
    [
        "http://www.blog.classroom.me.uk/index.php",
        "http://www.youtube.com/watch?v=ClkQA2Lb_iE",
        "https://www.youtube.com/watch?v=ClkQA2Lb_iE",
        "www.youtube.com/watch?v=ClkQA2Lb_iE",
        "ftps://ftp.websitename.com/dir/file.txt",
        "websitename.co.uk:1234/dir/file.txt",
        "ftps://websitename.com:1234/dir/file.txt",
        "example.com?param=value",
        "https://facebook.github.io/jest/",
        "//youtube.com/watch?v=ClkQA2Lb_iE",
        "http://localhost:4200/watch?v=ClkQA2Lb_iE",
    ].forEach(function (sample) {
        console.log(extractRootDomain(sample));
    });

    Regardless of whether the URL includes a protocol or even a port number, you can extract the domain. This is a very simplified, non-regex solution, so I think this will do.

    *Thank you @Timmerz, @renoirb, @rineez, @BigDong, @ra00l, @ILikeBeansTacos, @CharlesRobertson for your suggestions! @ross-allen, thank you for reporting the bug!

    0 讨论(0)
  • 2020-11-22 07:58

    One-liner with jQuery:

    $('<a>').attr('href', document.location.href).prop('hostname');
    
    0 讨论(0)
提交回复
热议问题