In my bash script I need to extract just the path from a given URL. For example, from a variable containing the string:
http://login:password@example.com/one/more/dir/fi
I wrote a function that will extract any part of the URL. I've only tested it in bash. Usage:
url_parse <url> [url-part]
example:
$ url_parse "http://example.com:8080/home/index.html" path
home/index.html
code:
#######################################
# Split a URL into its components and print the requested one.
#
# The URL is matched against one extended regular expression with bash's
# built-in [[ =~ ]] operator, so no external tool (gawk) is required and
# characters such as ',' inside the URL cannot shift the extracted fields.
#
# Arguments:
#   $1 - URL of the form
#        protocol://[user[:password]@]host[:port][/path][?query][#fragment]
#   $2 - optional part to print: protocol, auth, user, password, host-port,
#        host, port, path, query, fragment, or info (every part, one
#        "name:value" pair per line). When omitted or empty the URL is
#        only validated and nothing is printed.
# Outputs:
#   The requested part on stdout. path is printed without its leading '/',
#   query without '?', fragment without '#'. "auth" and "host-port" always
#   print the ':' separator, even when the second half is empty.
#   Diagnostics go to stderr.
# Returns:
#   0 on success, 1 if the URL does not match or the part name is unknown.
#######################################
url_parse() {
  local -r url=${1-} url_part=${2-}

  # URL tokens. Scheme, user info and host all stop at the first '/', '?'
  # or '#' (the delimiters RFC 3986 reserves), so an '@' or a second "://"
  # that appears later in the path or query is never mistaken for part of
  # the authority.
  local -r scheme_re='[^:/?#]+'
  local -r user_re='[^:@/?#]+'
  local -r password_re='[^@/?#]+'
  local -r host_re='[^:@/?#]+'
  local -r port_re='[0-9]+'
  local -r path_re='/([^?#]*)'
  local -r query_re='[?]([^#]+)'
  local -r fragment_re='#(.*)'
  local -r auth_re="(${user_re})(:(${password_re}))?@"
  local -r connection_re="(${auth_re})?(${host_re})(:(${port_re}))?"
  local url_regex="^(${scheme_re})://(${connection_re})?"
  url_regex+="(${path_re})?(${query_re})?(${fragment_re})?\$"

  # The regex must stay unquoted on the right-hand side of =~ so that it is
  # treated as a pattern rather than as a literal string.
  if ! [[ $url =~ $url_regex ]]; then
    printf 'Invalid URL: %s\n' "$url" >&2
    return 1
  fi

  # Capture-group numbers follow the order of the opening parentheses in
  # url_regex; groups that did not take part in the match are empty.
  local -r protocol=${BASH_REMATCH[1]}
  local -r user=${BASH_REMATCH[4]}
  local -r password=${BASH_REMATCH[6]}
  local -r host=${BASH_REMATCH[7]}
  local -r port=${BASH_REMATCH[9]}
  local -r path=${BASH_REMATCH[11]}
  local -r query=${BASH_REMATCH[13]}
  local -r fragment=${BASH_REMATCH[15]}

  case $url_part in
    protocol)  printf '%s\n' "$protocol" ;;
    auth)      printf '%s:%s\n' "$user" "$password" ;;  # ex: john.doe:1234
    user)      printf '%s\n' "$user" ;;
    password)  printf '%s\n' "$password" ;;
    host-port) printf '%s:%s\n' "$host" "$port" ;;      # ex: example.com:8080
    host)      printf '%s\n' "$host" ;;
    port)      printf '%s\n' "$port" ;;
    path)      printf '%s\n' "$path" ;;
    query)     printf '%s\n' "$query" ;;
    fragment)  printf '%s\n' "$fragment" ;;
    info)
      # printf reuses the format for each name/value pair.
      printf '%s:%s\n' \
        protocol "$protocol" \
        user "$user" \
        password "$password" \
        host "$host" \
        port "$port" \
        path "$path" \
        query "$query" \
        fragment "$fragment"
      ;;
    '') ;;  # no part requested: the call only validates the URL
    *)
      printf 'Invalid URL part: %s\n' "$url_part" >&2
      return 1
      ;;
  esac
}