I am currently using jsoup in an application to parse and analyse web pages. But I want to make sure that I adhere to the robots.txt rules and only visit pages which are allowed.
The above didn't work for me, so I managed to put this together. It is the first time I have written Java in 4 years, so I'm sure this can be improved.
/**
 * Checks whether a URL may be fetched according to the robots.txt of its host.
 *
 * <p>The robots.txt file is requested from the same scheme, host and port as {@code url}.
 * Every {@code Disallow} rule found in the file is applied, regardless of the
 * {@code User-agent} group it belongs to, so the check errs on the side of caution.
 * A rule blocks the URL when the URL's path starts with the rule's path; wildcards and
 * {@code Allow} lines are not interpreted.
 *
 * @param url the page that is about to be visited
 * @return {@code false} if a Disallow rule matches the path of {@code url} or the
 *         robots.txt address cannot be built; {@code true} otherwise, including when
 *         robots.txt cannot be read
 */
public static boolean robotSafe(URL url)
{
    URL urlRobot;
    try {
        // robots.txt only governs the scheme/host/port it was served from, so keep all three.
        urlRobot = new URL(url.getProtocol(), url.getHost(), url.getPort(), "/robots.txt");
    } catch (MalformedURLException e) {
        // something weird is happening, so don't trust it
        return false;
    }

    String strCommands;
    try (InputStream urlRobotStream = urlRobot.openStream()) {
        strCommands = readRobotsText(urlRobotStream);
    } catch (IOException e) {
        return true; // if there is no robots.txt file, it is OK to search
    }

    // "http://host" has an empty path but addresses the root document.
    String path = url.getPath();
    if (path.isEmpty()) {
        path = "/";
    }

    for (RobotRule robotRule : parseDisallowRules(strCommands)) {
        if (robotRule.rule.isEmpty()) {
            continue; // a blank Disallow blocks nothing; later rules must still be checked
        }
        if (path.startsWith(robotRule.rule)) {
            return false; // also covers the "/" rule, which is a prefix of every path
        }
    }
    return true;
}

/**
 * Reads the stream to its end and decodes the bytes as UTF-8 in one step, so multi-byte
 * characters are never split across read chunks and an empty stream yields "".
 * Fully-qualified names are used so no new imports are required.
 */
private static String readRobotsText(InputStream in) throws IOException
{
    java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
    byte[] chunk = new byte[1000];
    int numRead;
    while ((numRead = in.read(chunk)) != -1) {
        collected.write(chunk, 0, numRead);
    }
    return new String(collected.toByteArray(), java.nio.charset.StandardCharsets.UTF_8);
}

/** Collects one RobotRule per Disallow line that appears after a User-agent line. */
private static java.util.List<RobotRule> parseDisallowRules(String strCommands)
{
    java.util.List<RobotRule> robotRules = new ArrayList<>();
    String mostRecentUserAgent = null;
    for (String rawLine : strCommands.split("\n")) {
        // Everything after '#' is a comment and must not become part of a rule.
        int commentStart = rawLine.indexOf('#');
        String line = (commentStart >= 0 ? rawLine.substring(0, commentStart) : rawLine).trim();

        if (startsWithIgnoreCase(line, "user-agent")) {
            mostRecentUserAgent = valueAfterColon(line);
        } else if (startsWithIgnoreCase(line, DISALLOW) && mostRecentUserAgent != null) {
            RobotRule r = new RobotRule();
            r.userAgent = mostRecentUserAgent;
            r.rule = valueAfterColon(line);
            robotRules.add(r);
        }
    }
    return robotRules;
}

/** Returns the trimmed text after the first ':' (the whole line when there is no colon). */
private static String valueAfterColon(String line)
{
    return line.substring(line.indexOf(':') + 1).trim();
}

/** Case-insensitive {@code startsWith}; robots.txt field names are not case-sensitive. */
private static boolean startsWithIgnoreCase(String line, String prefix)
{
    return line.regionMatches(true, 0, prefix, 0, prefix.length());
}
And you will need the helper class:
/**
 * One entry read from a robots.txt file: a rule value paired with the user agent
 * it was declared for. Both fields are plain public members and start out as
 * {@code null} until the caller assigns them.
 *
 * @author Namhost.com
 */
public class RobotRule
{
    /** User agent the rule was listed under. */
    public String userAgent;

    /** The rule value, e.g. the path taken from a Disallow line. */
    public String rule;

    /** Creates an empty rule; callers fill in the fields afterwards. */
    RobotRule() {
    }

    /**
     * Renders the runtime class name followed by both fields, one per line, using the
     * value of the {@code line.separator} system property between lines.
     */
    @Override public String toString()
    {
        final String lineBreak = System.getProperty("line.separator");
        StringBuilder text = new StringBuilder();
        text.append(this.getClass().getName()).append(" Object {").append(lineBreak);
        text.append(" userAgent: ").append(this.userAgent).append(lineBreak);
        text.append(" rule: ").append(this.rule).append(lineBreak);
        text.append("}");
        return text.toString();
    }
}