//Crawl.cpp
#include "Crawl.h"
using namespace std;
/// Default constructor. The body is empty: the members are left to their own
/// default construction.
CCrawl::CCrawl() {}
/// Destructor. The body is empty: nothing is released explicitly here.
CCrawl::~CCrawl() {}
size_t CCrawl::WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp)
{
size_t realsize = size * nmemb;
struct MemoryStruct *mem = (struct MemoryStruct *)userp;
char *ptr = (char *)realloc(mem->memory, mem->size + realsize + 1);
if(ptr == NULL) {
/* out of memory! */
printf("not enough memory (realloc returned NULL)\n");
// exit(1);
return 0;
}
mem->memory = ptr;
memcpy(&(mem->memory[mem->size]), contents, realsize);
mem->size += realsize;
mem->memory[mem->size] = 0;
return realsize;
}
/**
 * Download strUrl and hand the response body to the caller as a heap buffer.
 *
 * Response headers and body are both accumulated through WriteMemoryCallback
 * into separate buffers. Only the body is returned; the header buffer is
 * released before returning. Every exit path releases the curl handle and any
 * buffer that is not handed to the caller.
 *
 * @param strUrl   URL to retrieve.
 * @param fileBuf  Out: on success receives a malloc/realloc-allocated,
 *                 NUL-terminated buffer holding the body. The caller owns it
 *                 and must release it with free(). Set to NULL on failure.
 * @param imgSize  Out: number of body bytes in *fileBuf; 0 on failure.
 * @return 0 on success, 1 on any failure (allocation, handle creation or
 *         transfer error).
 */
int CCrawl::fetch(string strUrl, char **fileBuf , size_t &imgSize)
{
    // Give the outputs a defined value on every path so callers never read
    // an indeterminate size or a stale pointer after a failed transfer.
    imgSize = 0;
    if (fileBuf == NULL) {
        return 1;
    }
    *fileBuf = NULL;

    MemoryStruct headerChunk;
    MemoryStruct bodyChunk;
    headerChunk.memory = (char *)malloc(1);  /* grown as needed by the callback */
    headerChunk.size = 0;
    bodyChunk.memory = (char *)malloc(1);    /* grown as needed by the callback */
    bodyChunk.size = 0;
    if (headerChunk.memory == NULL || bodyChunk.memory == NULL) {
        fprintf(stderr, "fetch: not enough memory for the receive buffers\n");
        free(headerChunk.memory);
        free(bodyChunk.memory);
        return 1;
    }
    // Keep both buffers valid C strings even if the callback is never invoked.
    headerChunk.memory[0] = 0;
    bodyChunk.memory[0] = 0;

    /* init the curl session */
    CURL *curl = curl_easy_init();
    if (curl == NULL) {
        fprintf(stderr, "curl_easy_init() failed\n");
        free(headerChunk.memory);
        free(bodyChunk.memory);
        return 1;
    }

    /* specify URL to get */
    curl_easy_setopt(curl, CURLOPT_URL, strUrl.c_str());
    /* complete within 20 seconds */
    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 20L);
    /* headers and body go through the same callback, into separate buffers */
    curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, WriteMemoryCallback);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
    /* CURLOPT_HEADERDATA is the current name of the old CURLOPT_WRITEHEADER */
    curl_easy_setopt(curl, CURLOPT_HEADERDATA, (void *)&headerChunk);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&bodyChunk);
    /* some servers don't like requests that are made without a user-agent field, so we provide one */
    curl_easy_setopt(curl, CURLOPT_USERAGENT, "libcurl-agent/1.0");

    /* get it! */
    const CURLcode res = curl_easy_perform(curl);

    /* the handle and the header buffer are no longer needed, whatever the outcome */
    curl_easy_cleanup(curl);
    free(headerChunk.memory);

    if (res != CURLE_OK) {
        fprintf(stderr, "curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
        free(bodyChunk.memory);
        return 1;
    }

    /* success: ownership of the body buffer moves to the caller */
    imgSize = bodyChunk.size;
    *fileBuf = bodyChunk.memory;
    return 0;
}
//See more: https://blog.51cto.com/fengyuzaitu/2434920
//_________________________________________________
/**
 * Write callback that forwards one delivery of bytes to a std::ofstream.
 *
 * @param ptr     Bytes delivered for this call.
 * @param size    Size of one element.
 * @param nmemb   Number of elements; the byte count is size * nmemb.
 * @param stream  Pointer to the std::ofstream that receives the data.
 * @return size * nmemb when the bytes were written; 0 when the stream pointer
 *         is NULL or the stream is in a failed state after the write. For a
 *         non-empty delivery, a return value different from the byte count
 *         makes libcurl stop the transfer with CURLE_WRITE_ERROR instead of
 *         continuing to download data that is being discarded.
 */
size_t CCrawl::WriteFile(void *ptr, size_t size, size_t nmemb, void *stream)
{
    std::ofstream *ofs = static_cast<std::ofstream *>(stream);
    const size_t nLen = size * nmemb;
    if (ofs == NULL) {
        return 0;
    }
    ofs->write(static_cast<const char *>(ptr), static_cast<std::streamsize>(nLen));
    // Report a failed write instead of pretending the bytes were stored.
    return ofs->fail() ? 0 : nLen;
}
/**
 * Download strUrl and stream the response body straight into "img.jpg" in the
 * current working directory, using WriteFile as the write callback.
 *
 * If the output file cannot be opened, the failure is reported on std::cerr
 * and no transfer is started. A transfer error is reported on std::cout,
 * except CURLE_WRITE_ERROR, which the original code deliberately does not
 * report.
 *
 * @param strUrl  URL of the image to download.
 */
void CCrawl::TestStorePhotoFileFromUrl(std::string strUrl)
{
    std::ofstream ofs;
    ofs.open("img.jpg", std::ios::out | std::ios::binary);
    if (!ofs.is_open()) {
        // Nothing could be stored, so do not start the download at all.
        std::cerr << "TestStorePhotoFileFromUrl: can not open img.jpg for writing" << std::endl;
        return;
    }

    CURL *pCurlHandle = curl_easy_init();
    curl_easy_setopt(pCurlHandle, CURLOPT_URL, strUrl.c_str());
    curl_easy_setopt(pCurlHandle, CURLOPT_WRITEDATA, &ofs);
    curl_easy_setopt(pCurlHandle, CURLOPT_WRITEFUNCTION, WriteFile);

    CURLcode nCurlRet = curl_easy_perform(pCurlHandle);
    if ((nCurlRet != CURLE_OK) && (nCurlRet != CURLE_WRITE_ERROR)){
        std::cout << "通过LibCurl获取:" << strUrl << "图片失败,错误码是:" << nCurlRet;
    }

    ofs.close();
    curl_easy_cleanup(pCurlHandle);
}
//store to memory
/**
 * Write callback that appends one delivery of bytes to a std::string.
 *
 * @param ptr     Bytes delivered for this call.
 * @param size    Size of one element.
 * @param nmemb   Number of elements; the byte count is size * nmemb.
 * @param stream  Pointer to the std::string that accumulates the data.
 * @return The number of bytes appended (size * nmemb).
 */
size_t CCrawl::WriteBuffer(void *ptr, size_t size, size_t nmemb, void *stream)
{
    const size_t byteCount = size * nmemb;
    std::string &sink = *static_cast<std::string *>(stream);
    sink.append(static_cast<const char *>(ptr), byteCount);
    return byteCount;
}
void CCrawl::TestStoreBufferFromUrl(std::string strUrl)
{
std::string strPhotoBuffer;
CURL *pCurlHandle;
pCurlHandle = curl_easy_init();
curl_easy_setopt(pCurlHandle, CURLOPT_URL, strUrl.c_str());
curl_easy_setopt(pCurlHandle, CURLOPT_WRITEDATA, &strPhotoBuffer);
curl_easy_setopt(pCurlHandle, CURLOPT_WRITEFUNCTION, WriteBuffer);
CURLcode nCurlRet = curl_easy_perform(pCurlHandle);
if ((nCurlRet != CURLE_OK) && (nCurlRet != CURLE_WRITE_ERROR)){
std::cout << "通过LibCurl获取:" << strUrl << "图片失败,错误码是:" << nCurlRet;
}else{
std::ofstream ofs;
ofs.open("img2.jpg", std::ios::out | std::ios::binary);
ofs << strPhotoBuffer;
ofs.close();
}
curl_easy_cleanup(pCurlHandle);
}
/**
 * Load URLs from a seed file into setImgUrl.
 *
 * At most the first 1000 lines of the file are read. Lines containing the
 * text "Root:" are skipped (they still count toward the 1000-line limit);
 * every other line is inserted into the set as-is.
 *
 * @param InputFile  Path of the seed file.
 * @return 0 after reading, 1 when the file could not be opened.
 */
int CCrawl::AddUrl(string InputFile)
{
    // open the seed url file
    ifstream seedStream(InputFile.c_str());
    if (seedStream.fail()) {
        return 1;
    }

    const int kMaxLines = 1000;
    string line;
    int linesRead = 0;
    while (linesRead < kMaxLines && getline(seedStream, line)) {
        ++linesRead;
        if (line.find("Root:") == string::npos) {
            setImgUrl.insert(line);
        }
    }

    seedStream.close();
    return 0;
}
//Crawl.h
#ifndef _Crawl_H_191220_
#define _Crawl_H_191220_
#include <string>
#include <fstream>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <unistd.h>
#include <string>
#include <set>
#include <string.h>
#include <algorithm>
#include <sstream>
#include <sys/io.h>
#include <fcntl.h>
#include <sys/wait.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/msg.h>
#include <errno.h>
#include <signal.h>
#include <curl/curl.h>
using namespace std;
// Small libcurl-based downloader: fetches a URL into memory or into a file.
class CCrawl
{
public:
// NOTE(review): not referenced by any member function visible in Crawl.cpp;
// presumably intended to hold response headers - confirm before relying on it.
string HeaderInf;
public:
CCrawl();
~CCrawl();
// Downloads strUrl. Returns 0 on success and 1 on failure. On success
// *fileBuf receives a heap buffer with the response body (released by the
// caller with free()) and the third argument receives its size in bytes.
int fetch(string strUrl, char **fileBuf, size_t &nmemb );
// Downloads strUrl and streams the body into the file "img.jpg".
static void TestStorePhotoFileFromUrl(std::string strUrl);
// Downloads strUrl into a std::string, then writes it to "img2.jpg".
static void TestStoreBufferFromUrl(std::string strUrl);
private:
// URLs collected by AddUrl.
set<string> setImgUrl;
// Growable byte buffer filled by WriteMemoryCallback.
struct MemoryStruct {
char *memory; // heap block, grown with realloc
size_t size; // number of payload bytes stored in memory
};
private:
// Reads up to 1000 lines from InputFile into setImgUrl, skipping lines that
// contain "Root:". Returns 0 after reading, 1 if the file cannot be opened.
int AddUrl(string InputFile);
// Write callback: appends size * nmemb bytes to the MemoryStruct behind userp.
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp);
// NOTE(review): declared here, but no definition appears in the visible
// Crawl.cpp; calling it would presumably fail at link time - confirm.
static size_t save_header(void *ptr, size_t size, size_t nmemb, FILE *fp);
// Write callback: writes size * nmemb bytes to the std::ofstream behind stream.
static size_t WriteFile(void *ptr, size_t size, size_t nmemb, void *stream);
// Write callback: appends size * nmemb bytes to the std::string behind stream.
static size_t WriteBuffer(void *ptr, size_t size, size_t nmemb, void *stream);
};
#endif
//micSky.cpp
#include <string>
#include <fstream>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "Crawl.h"
//https://curl.haxx.se/libcurl/c/getinmemory.html
int main(int arc, char* arv[])
{
size_t imgSize ;
char *fileHead = NULL;
CCrawl iCrawl;
string strUrl;
curl_global_init(CURL_GLOBAL_ALL);
// open the seed url file
ifstream ifsSeed("imgUrl");
if (!ifsSeed){
exit(1);
}
string::size_type idx;
static long Init = 90000;
long int i=0;
for(;;i++){//each time read a group of lines.
if ( !getline(ifsSeed,strUrl) )
break;
if(i<Init) continue;
if(((idx = strUrl.find("Root:")) != string::npos)){
if( i> Init + 1000) break;
continue;
}
string dom;
string::size_type idx;
if( (idx = strUrl.rfind(".")) != string::npos ){
dom = strUrl.substr(idx);
}else // impossible .
cerr << "Error 1; " << endl;
int bet = iCrawl.fetch(strUrl, &fileHead, imgSize );
if(imgSize<500) continue;
if(bet!=1){
char food[128];
sprintf(food,"Img%d%s",i,dom.c_str());
/***
std::ofstream ofs;
ofs.open(food, std::ios::out | std::ios::binary);
ofs << fileHead;
ofs.close();
//*/
//*
FILE *fp;
fp=fopen(food,"wb");
if(!fp){
// printf("/Spider/imgDown Error: can not open the file.\n");
cerr << "Error 2; can not open the file.." << strUrl << endl;
exit(1);
}
int ret=fwrite(fileHead,imgSize,1,fp);
if(ret!=1){
cerr << "Error 3; can not write the pixel data.. " << i << " ). " << strUrl << endl;
}
fclose(fp);
//*/
}else
cerr << "Error 4; can not fetch the net page.. " << i << " ). " << strUrl << endl;
cout << i << " ). " << strUrl << " imgSize : "<< imgSize <<endl;
if (fileHead){
free(fileHead); fileHead=NULL;
imgSize = 0;
}
}//_for
ifsSeed.close();
curl_global_cleanup();
exit(0);
iCrawl.TestStorePhotoFileFromUrl(strUrl);
iCrawl.TestStoreBufferFromUrl(strUrl);
exit(0);
}
#CMakeLists.txt
# Build script for the "Sky" executable: every source file in this directory,
# linked against libcurl.
cmake_minimum_required(VERSION 2.8)
project( Sky )
# Locate libcurl; REQUIRED makes configuration stop if it is not found.
find_package(CURL REQUIRED)
include_directories(${CURL_INCLUDE_DIR})
# Collect the source files of the current directory into SRC_LIST.
aux_source_directory(. SRC_LIST)
# The executable is named after the project.
add_executable(${PROJECT_NAME} ${SRC_LIST})
target_link_libraries(${PROJECT_NAME} ${CURL_LIBRARY})
# Probe the compiler for a C++11 switch, preferring -std=c++11 over -std=c++0x.
include(CheckCXXCompilerFlag)
CHECK_CXX_COMPILER_FLAG("-std=c++11" COMPILER_SUPPORTS_CXX11)
CHECK_CXX_COMPILER_FLAG("-std=c++0x" COMPILER_SUPPORTS_CXX0X)
if(COMPILER_SUPPORTS_CXX11)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
elseif(COMPILER_SUPPORTS_CXX0X)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x")
else()
# Only a status message is printed; configuration continues without the flag.
message(STATUS "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler.")
endif()
The result is satisfactory …
来源:CSDN
作者:szliug1958
链接:https://blog.csdn.net/szliug1958/article/details/103791961