I need to scrape this HTML page ...
https://www.sanita.puglia.it/monitorpo/aslfg/monitorps-web/monitorps/monitorPSperASL.do?codNazionale=160115
....
I think the following ought to help a little - you will need to tweak the XPath query perhaps to target a specific table and thus specific cell contents but the main code seems to work ok. The problem I suspect with the original code was that the url is https
which usually requires additional configuration settings when making a curl request. There are settings in the curlrequest
function which could be removed, I just copied from another script where I had these set.
Change the path to $cacert
to a copy of cacert.pem
on your system or to the live version on curl.haxx.se
$url = 'https://www.sanita.puglia.it/monitorpo/aslfg/monitorps-web/monitorps/monitorPSperASL.do?codNazionale=160115';
function _curlrequest( $url=null, $options=null ){
$cacert='c:/wwwroot/cacert.pem';
$vbh = fopen('php://temp', 'w+');
$res=array(
'response' => null,
'verbose' => null,
'info' => array( 'http_code' => 100 ),
'headers' => null,
'errors' => null
);
if( is_null( $url ) ) return (object)$res;
session_write_close();
/* Initialise curl request object */
$curl=curl_init();
if( parse_url( $url,PHP_URL_SCHEME )=='https' ){
curl_setopt( $curl, CURLOPT_SSL_VERIFYPEER, true );
curl_setopt( $curl, CURLOPT_SSL_VERIFYHOST, 2 );
curl_setopt( $curl, CURLOPT_CAINFO, $cacert );
}
/* Define standard options */
curl_setopt( $curl, CURLOPT_URL,trim( $url ) );
curl_setopt( $curl, CURLOPT_AUTOREFERER, true );
curl_setopt( $curl, CURLOPT_FOLLOWLOCATION, true );
curl_setopt( $curl, CURLOPT_FAILONERROR, true );
curl_setopt( $curl, CURLOPT_HEADER, false );
curl_setopt( $curl, CURLINFO_HEADER_OUT, false );
curl_setopt( $curl, CURLOPT_RETURNTRANSFER, true );
curl_setopt( $curl, CURLOPT_BINARYTRANSFER, true );
curl_setopt( $curl, CURLOPT_CONNECTTIMEOUT, 20 );
curl_setopt( $curl, CURLOPT_TIMEOUT, 60 );
curl_setopt( $curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' );
curl_setopt( $curl, CURLOPT_MAXREDIRS, 10 );
curl_setopt( $curl, CURLOPT_ENCODING, '' );
curl_setopt( $curl,CURLOPT_VERBOSE,true );
curl_setopt( $curl,CURLOPT_NOPROGRESS,true );
curl_setopt( $curl,CURLOPT_STDERR,$vbh );
/* Assign runtime parameters as options */
if( isset( $options ) && is_array( $options ) ){
foreach( $options as $param => $value ) curl_setopt( $curl, $param, $value );
}
/* Execute the request and store responses */
$res=(object)array(
'response' => curl_exec( $curl ),
'info' => (object)curl_getinfo( $curl ),
'errors' => curl_error( $curl )
);
rewind( $vbh );
$res->verbose=stream_get_contents( $vbh );
fclose( $vbh );
curl_close( $curl );
return $res;
}
function getdom( $data=false, $debug=false ){
try{
if( !$data )throw new Exception('No data passed whilst trying to invoke DOMDocument');
libxml_use_internal_errors( true );
$dom = new DOMDocument();
$dom->validateOnParse=false;
$dom->standalone=true;
$dom->strictErrorChecking=false;
$dom->recover=true;
$dom->formatOutput=false;
$dom->loadHTML( $data );
$errors=libxml_get_errors();
libxml_clear_errors();
return !empty( $errors ) && $debug ? $errors : $dom;
}catch( Exception $e ){
echo $e->getMessage();
}
}
$obj=_curlrequest( $url );
if( $obj->info->http_code==200 ){
$dom=getdom( $obj->response );
$xp=new DOMXPath( $dom );
$query='//div[ contains( @class,"cRiga3 boxtriageS" ) ]';
$col=$xp->query( $query );
if( !empty( $col ) && $col->length > 0 ){
foreach( $col as $node )echo $node->nodeValue . '
';
}
}
This outputs
2
20
37
>1h
1
2
24
10
5
7
32
29
0
3
25
5
0
0
6
2