I\'ve been looking for a simple regex for URLs, does anybody have one handy that works well? I didn\'t find one with the zend framework validation classes and have seen sev
And there is your answer =) Try to break it, you can't!!!
function link_validate_url($text) {
$LINK_DOMAINS = 'aero|arpa|asia|biz|com|cat|coop|edu|gov|info|int|jobs|mil|museum|name|nato|net|org|pro|travel|mobi|local';
$LINK_ICHARS_DOMAIN = (string) html_entity_decode(implode("", array( // @TODO completing letters ...
"æ", // æ
"Æ", // Æ
"À", // À
"à", // à
"Á", // Á
"á", // á
"Â", // Â
"â", // â
"å", // å
"Å", // Å
"ä", // ä
"Ä", // Ä
"Ç", // Ç
"ç", // ç
"Ð", // Ð
"ð", // ð
"È", // È
"è", // è
"É", // É
"é", // é
"Ê", // Ê
"ê", // ê
"Ë", // Ë
"ë", // ë
"Î", // Î
"î", // î
"Ï", // Ï
"ï", // ï
"ø", // ø
"Ø", // Ø
"ö", // ö
"Ö", // Ö
"Ô", // Ô
"ô", // ô
"Õ", // Õ
"õ", // õ
"Œ", // Œ
"œ", // œ
"ü", // ü
"Ü", // Ü
"Ù", // Ù
"ù", // ù
"Û", // Û
"û", // û
"Ÿ", // Ÿ
"ÿ", // ÿ
"Ñ", // Ñ
"ñ", // ñ
"þ", // þ
"Þ", // Þ
"ý", // ý
"Ý", // Ý
"¿", // ¿
)), ENT_QUOTES, 'UTF-8');
$LINK_ICHARS = $LINK_ICHARS_DOMAIN . (string) html_entity_decode(implode("", array(
"ß", // ß
)), ENT_QUOTES, 'UTF-8');
$allowed_protocols = array('http', 'https', 'ftp', 'news', 'nntp', 'telnet', 'mailto', 'irc', 'ssh', 'sftp', 'webcal');
// Starting a parenthesis group with (?: means that it is grouped, but is not captured
$protocol = '((?:'. implode("|", $allowed_protocols) .'):\/\/)';
$authentication = "(?:(?:(?:[\w\.\-\+!$&'\(\)*\+,;=" . $LINK_ICHARS . "]|%[0-9a-f]{2})+(?::(?:[\w". $LINK_ICHARS ."\.\-\+%!$&'\(\)*\+,;=]|%[0-9a-f]{2})*)?)?@)";
$domain = '(?:(?:[a-z0-9' . $LINK_ICHARS_DOMAIN . ']([a-z0-9'. $LINK_ICHARS_DOMAIN . '\-_\[\]])*)(\.(([a-z0-9' . $LINK_ICHARS_DOMAIN . '\-_\[\]])+\.)*('. $LINK_DOMAINS .'|[a-z]{2}))?)';
$ipv4 = '(?:[0-9]{1,3}(\.[0-9]{1,3}){3})';
$ipv6 = '(?:[0-9a-fA-F]{1,4}(\:[0-9a-fA-F]{1,4}){7})';
$port = '(?::([0-9]{1,5}))';
// Pattern specific to external links.
$external_pattern = '/^'. $protocol .'?'. $authentication .'?('. $domain .'|'. $ipv4 .'|'. $ipv6 .' |localhost)'. $port .'?';
// Pattern specific to internal links.
$internal_pattern = "/^(?:[a-z0-9". $LINK_ICHARS ."_\-+\[\]]+)";
$internal_pattern_file = "/^(?:[a-z0-9". $LINK_ICHARS ."_\-+\[\]\.]+)$/i";
$directories = "(?:\/[a-z0-9". $LINK_ICHARS ."_\-\.~+%=&,$'#!():;*@\[\]]*)*";
// Yes, four backslashes == a single backslash.
$query = "(?:\/?\?([?a-z0-9". $LINK_ICHARS ."+_|\-\.~\/\\\\%=&,$'():;*@\[\]{} ]*))";
$anchor = "(?:#[a-z0-9". $LINK_ICHARS ."_\-\.~+%=&,$'():;*@\[\]\/\?]*)";
// The rest of the path for a standard URL.
$end = $directories .'?'. $query .'?'. $anchor .'?'.'$/i';
$message_id = '[^@].*@'. $domain;
$newsgroup_name = '(?:[0-9a-z+-]*\.)*[0-9a-z+-]*';
$news_pattern = '/^news:('. $newsgroup_name .'|'. $message_id .')$/i';
$user = '[a-zA-Z0-9'. $LINK_ICHARS .'_\-\.\+\^!#\$%&*+\/\=\?\`\|\{\}~\'\[\]]+';
$email_pattern = '/^mailto:'. $user .'@'.'(?:'. $domain .'|'. $ipv4 .'|'. $ipv6 .'|localhost)'. $query .'?$/';
if (strpos($text, '') === 0) {
return false;
}
if (in_array('mailto', $allowed_protocols) && preg_match($email_pattern, $text)) {
return false;
}
if (in_array('news', $allowed_protocols) && preg_match($news_pattern, $text)) {
return false;
}
if (preg_match($internal_pattern . $end, $text)) {
return false;
}
if (preg_match($external_pattern . $end, $text)) {
return false;
}
if (preg_match($internal_pattern_file, $text)) {
return false;
}
return true;
}