Parsing robots.txt using Java and identifying whether a URL is allowed

小蘑菇 2021-01-06 08:24

I am currently using jsoup in an application to parse and analyse web pages, but I want to make sure that I adhere to the robots.txt rules and only visit pages which are allowed.

2 Answers
  •  星月不相逢
    2021-01-06 09:06

    The above didn't work for me. I managed to put this together. It's the first time I've done Java in 4 years, so I'm sure this can be improved.

    // imports needed at the top of your class file
    import java.io.IOException;
    import java.io.InputStream;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.util.ArrayList;
    import java.util.List;

    // the constant the method below refers to
    private static final String DISALLOW = "Disallow:";

    public static boolean robotSafe(URL url)
    {
        String strHost = url.getHost();

        // robots.txt always lives at the root of the host
        String strRobot = "http://" + strHost + "/robots.txt";
        URL urlRobot;
        try {
            urlRobot = new URL(strRobot);
        } catch (MalformedURLException e) {
            // something weird is happening, so don't trust it
            return false;
        }

        // read the whole robots.txt file into one string
        String strCommands;
        try
        {
            InputStream urlRobotStream = urlRobot.openStream();
            byte[] b = new byte[1000];
            int numRead = urlRobotStream.read(b);
            strCommands = (numRead > 0) ? new String(b, 0, numRead) : "";
            while (numRead != -1) {
                numRead = urlRobotStream.read(b);
                if (numRead != -1)
                {
                    String newCommands = new String(b, 0, numRead);
                    strCommands += newCommands;
                }
            }
            urlRobotStream.close();
        }
        catch (IOException e)
        {
            return true; // if there is no robots.txt file, it is OK to search
        }

        if (strCommands.contains(DISALLOW)) // if there are no "Disallow" lines, nothing is blocked
        {
            // collect every Disallow rule together with the most recent User-agent above it
            String[] split = strCommands.split("\n");
            List<RobotRule> robotRules = new ArrayList<>();
            String mostRecentUserAgent = null;
            for (int i = 0; i < split.length; i++)
            {
                String line = split[i].trim();
                if (line.toLowerCase().startsWith("user-agent"))
                {
                    int start = line.indexOf(":") + 1;
                    int end   = line.length();
                    mostRecentUserAgent = line.substring(start, end).trim();
                }
                else if (line.startsWith(DISALLOW)) {
                    if (mostRecentUserAgent != null) {
                        RobotRule r = new RobotRule();
                        r.userAgent = mostRecentUserAgent;
                        int start = line.indexOf(":") + 1;
                        int end   = line.length();
                        r.rule = line.substring(start, end).trim();
                        robotRules.add(r);
                    }
                }
            }

            for (RobotRule robotRule : robotRules)
            {
                String path = url.getPath();
                if (robotRule.rule.isEmpty()) return true;    // a blank Disallow allows everything
                if (robotRule.rule.equals("/")) return false; // "Disallow: /" blocks everything

                // block the URL if its path starts with a disallowed prefix
                if (robotRule.rule.length() <= path.length())
                {
                    String pathCompare = path.substring(0, robotRule.rule.length());
                    if (pathCompare.equals(robotRule.rule)) return false;
                }
            }
        }
        return true;
    }
    

    And you will need the helper class:

    /**
     *
     * @author Namhost.com
     */
    public class RobotRule 
    {
        public String userAgent;
        public String rule;
    
        RobotRule() {
    
        }
    
        @Override public String toString() 
        {
            StringBuilder result = new StringBuilder();
            String NEW_LINE = System.getProperty("line.separator");
            result.append(this.getClass().getName() + " Object {" + NEW_LINE);
            result.append("   userAgent: " + this.userAgent + NEW_LINE);
            result.append("   rule: " + this.rule + NEW_LINE);
            result.append("}");
            return result.toString();
        }    
    }
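
    A quick way to try it out — a minimal sketch only, assuming robotSafe and RobotRule live in the same class and that pages are fetched with jsoup as in the question; the example.com URL is just a placeholder:

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import java.net.URL;

    // assumes robotSafe(URL) and RobotRule from above are declared in this same class
    public class RobotsDemo
    {
        public static void main(String[] args) throws Exception
        {
            // placeholder URL -- replace with the page you actually want to crawl
            URL target = new URL("http://example.com/some/page.html");

            if (robotSafe(target)) {
                // robots.txt does not disallow this path, so it is OK to fetch
                Document doc = Jsoup.connect(target.toString()).get();
                System.out.println("Fetched: " + doc.title());
            } else {
                System.out.println("Skipping, disallowed by robots.txt: " + target);
            }
        }
    }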
    
