本工程使用 Spring Boot 整合 WebMagic 爬虫框架,并通过 MyBatis 将爬取的数据存入 MySQL。
pom文件
webmagic依赖
<!-- WebMagic crawler framework: core module plus the extension module -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <!-- WebMagic brings in the slf4j-log4j12 binding; exclude it so it does not
             clash with the Logback binding that Spring Boot uses by default -->
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <!-- Same exclusion: the binding is also reachable transitively through this module -->
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>
mybatis逆向工程依赖
<!-- MyBatis integration for Spring Boot -->
<dependency>
    <groupId>org.mybatis.spring.boot</groupId>
    <artifactId>mybatis-spring-boot-starter</artifactId>
    <version>2.1.0</version>
</dependency>
<!-- MyBatis Generator library. Untested note from the author: the artifact id is
     mybatis-generator-core, not mybatis-generator -->
<dependency>
    <groupId>org.mybatis.generator</groupId>
    <artifactId>mybatis-generator-core</artifactId>
    <version>1.3.5</version>
</dependency>
<!-- mybatis-generator-maven-plugin is a Maven plugin and is declared under build/plugins;
     it is intentionally NOT listed here as a project dependency -->
<!-- ===================== build plugins ===================== -->
<plugin>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
<plugin>
    <groupId>org.mybatis.generator</groupId>
    <artifactId>mybatis-generator-maven-plugin</artifactId>
    <version>1.3.5</version>
    <!-- Location of the generator configuration file; forward slashes keep the path
         portable across Windows, Linux and macOS -->
    <configuration>
        <configurationFile>src/main/resources/generatorConfig.xml</configurationFile>
        <verbose>true</verbose>
        <overwrite>true</overwrite>
    </configuration>
    <!-- Jars the plugin itself needs at generation time (the JDBC driver) -->
    <dependencies>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.17</version>
        </dependency>
    </dependencies>
</plugin>
完整pom文件
<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter</artifactId>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- WebMagic crawler framework: core module plus the extension module -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-core</artifactId>
        <version>0.7.3</version>
        <exclusions>
            <!-- WebMagic brings in the slf4j-log4j12 binding; exclude it so it does not
                 clash with the Logback binding that Spring Boot uses by default -->
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.7.3</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <!-- MySQL JDBC driver used by the application at runtime -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.15</version>
    </dependency>
    <!-- MyBatis integration for Spring Boot -->
    <dependency>
        <groupId>org.mybatis.spring.boot</groupId>
        <artifactId>mybatis-spring-boot-starter</artifactId>
        <version>2.1.0</version>
    </dependency>
    <!-- MyBatis Generator library. Untested note from the author: the artifact id is
         mybatis-generator-core, not mybatis-generator -->
    <dependency>
        <groupId>org.mybatis.generator</groupId>
        <artifactId>mybatis-generator-core</artifactId>
        <version>1.3.5</version>
    </dependency>
    <!-- mybatis-generator-maven-plugin is a Maven plugin; it is declared once under
         build/plugins below and must not be listed as a project dependency -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
        <exclusions>
            <exclusion>
                <groupId>org.junit.vintage</groupId>
                <artifactId>junit-vintage-engine</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.18.10</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <plugin>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-maven-plugin</artifactId>
        </plugin>
        <plugin>
            <groupId>org.mybatis.generator</groupId>
            <artifactId>mybatis-generator-maven-plugin</artifactId>
            <version>1.3.5</version>
            <!-- Location of the generator configuration file; forward slashes keep the
                 path portable across Windows, Linux and macOS -->
            <configuration>
                <configurationFile>src/main/resources/generatorConfig.xml</configurationFile>
                <verbose>true</verbose>
                <overwrite>true</overwrite>
            </configuration>
            <!-- Jars the plugin itself needs at generation time (the JDBC driver) -->
            <dependencies>
                <dependency>
                    <groupId>mysql</groupId>
                    <artifactId>mysql-connector-java</artifactId>
                    <version>8.0.17</version>
                </dependency>
            </dependencies>
        </plugin>
    </plugins>
</build>
application.properties
# HTTP port of the embedded web server
server.port=10000
# JDBC driver class. The generic key is used here because the
# spring.datasource.tomcat.* keys are only bound when the Tomcat JDBC pool is the active pool.
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
# Datasource used by MyBatis
spring.datasource.url=jdbc:mysql://IP:3306/web_magic?useUnicode=true&characterEncoding=UTF-8&transformedBitIsBoolean=true&autoReconnect=true&failOverReadOnly=false&allowMultiQueries=true&useSSL=false
spring.datasource.username=root
spring.datasource.password=password
#spring.datasource.tomcat.default-auto-commit=true
# Package scanned for MyBatis type aliases, and location of the mapper XML files
mybatis.type-aliases-package=com.jatham.pojo
mybatis.mapper-locations=classpath:mapper/*.xml
# NOTE(review): the spring.datasource.dbcp2.* keys below only take effect when Commons DBCP2
# is the active connection pool - confirm which pool this project actually uses.
# Minimum number of idle connections kept in the pool
spring.datasource.dbcp2.min-idle=5
# Maximum number of idle connections kept in the pool
spring.datasource.dbcp2.max-idle=5
# Number of connections created when the pool starts
spring.datasource.dbcp2.initial-size=5
# Maximum time (milliseconds) to wait for a connection from the pool
spring.datasource.dbcp2.max-wait-millis=5000
mybatis逆向工程配置文件 generatorConfig.xml
使用时需要修改JDBC连接参数和数据库的表名;mapper的生成地址尽量不要修改,否则会出现SqlSession无法注入的问题。
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE generatorConfiguration
        PUBLIC "-//mybatis.org//DTD MyBatis Generator Configuration 1.0//EN"
        "http://mybatis.org/dtd/mybatis-generator-config_1_0.dtd">
<generatorConfiguration>
    <context id="testTables" targetRuntime="MyBatis3">
        <commentGenerator>
            <!-- Suppress the auto-generated comments: true = yes, false = no -->
            <property name="suppressAllComments" value="true" />
        </commentGenerator>
        <!-- Database connection: driver class, URL, user and password.
             The driver class matches the one in application.properties;
             com.mysql.jdbc.Driver is the deprecated legacy name in Connector/J 8.x -->
        <jdbcConnection driverClass="com.mysql.cj.jdbc.Driver"
                        connectionURL="jdbc:mysql://IP:3306/web_magic"
                        userId="root"
                        password="password">
            <property name="nullCatalogMeansCurrent" value="true"/>
        </jdbcConnection>
        <!-- forceBigDecimals=false (default): JDBC DECIMAL and NUMERIC columns are mapped to
             an integral Java type when the scale and length allow it;
             true: they are always mapped to java.math.BigDecimal -->
        <javaTypeResolver>
            <property name="forceBigDecimals" value="false" />
        </javaTypeResolver>
        <!-- targetProject: where the generated model (PO) classes are written -->
        <javaModelGenerator targetPackage="com.jatham.pojo"
                            targetProject="src/main/java">
            <!-- enableSubPackages: whether the schema is appended as a sub-package -->
            <property name="enableSubPackages" value="false" />
            <!-- Trim leading/trailing whitespace of String values in the generated setters -->
            <property name="trimStrings" value="true" />
        </javaModelGenerator>
        <!-- targetProject: where the generated mapper XML files are written -->
        <sqlMapGenerator targetPackage="mapper"
                         targetProject="src/main/resources">
            <!-- enableSubPackages: whether the schema is appended as a sub-package -->
            <property name="enableSubPackages" value="false" />
        </sqlMapGenerator>
        <!-- targetPackage: where the generated mapper interfaces are written -->
        <javaClientGenerator type="XMLMAPPER"
                             targetPackage="com.jatham.mapper"
                             targetProject="src/main/java">
            <!-- enableSubPackages: whether the schema is appended as a sub-package -->
            <property name="enableSubPackages" value="false" />
        </javaClientGenerator>
        <!-- Tables to generate code for -->
        <table tableName="auth_name" schema="" enableCountByExample="false"
               enableDeleteByExample="false" enableUpdateByExample="false"
               enableSelectByExample="false" selectByExampleQueryId="false">
        </table>
        <!-- Some columns may need an explicit Java type:
        <table schema="" tableName="">
            <columnOverride column="" javaType="" />
        </table> -->
    </context>
</generatorConfiguration>
mybatis流程
逆向工程会生成XXXMapper.xml、XXXMapper.java(mapper接口)和XXX实体类.java
编写service,controller测试
/**
 * Spring service that forwards {@link AuthName} persistence calls to
 * {@link AuthNameMapper}. It adds no logic of its own.
 */
@Service
public class AuthNameService {

    /** Mapper injected by Spring; every method below delegates to it. */
    @Autowired
    private AuthNameMapper authNameMapper;

    /**
     * Looks up a record by its primary key.
     *
     * @param id primary key to look up
     * @return whatever the mapper returns for that key
     */
    public AuthName selectByPrimaryKey(Integer id) {
        final AuthName found = authNameMapper.selectByPrimaryKey(id);
        return found;
    }

    /**
     * Inserts the given record through the mapper's {@code insert} statement.
     *
     * @param record record to insert
     * @return the mapper's return value (presumably the affected row count; confirm in the mapper XML)
     */
    public int insert(AuthName record) {
        final int result = authNameMapper.insert(record);
        return result;
    }

    /**
     * Inserts the given record through the mapper's {@code insertSelective} statement.
     *
     * @param record record to insert
     * @return the mapper's return value (presumably the affected row count; confirm in the mapper XML)
     */
    public int insertSelective(AuthName record) {
        final int result = authNameMapper.insertSelective(record);
        return result;
    }
}
/**
 * REST endpoints for manually testing {@link AuthNameService}.
 *
 * <p>{@code @RestController} already implies {@code @ResponseBody} on every handler,
 * so no handler repeats that annotation.
 */
@RestController
public class AuthNameController {

    @Autowired
    private AuthNameService authNameService;

    /**
     * Returns the record with the given primary key.
     *
     * @param id value of the {@code id} request parameter
     * @return the record returned by the service for that key
     */
    @GetMapping("/getid")
    public AuthName getById(@RequestParam(value = "id") int id) {
        return authNameService.selectByPrimaryKey(id);
    }

    /**
     * Inserts a record through {@link AuthNameService#insert(AuthName)}.
     *
     * @param id   value of the {@code id} request parameter
     * @param name value of the {@code name} request parameter
     * @return the value returned by the service
     */
    @PostMapping("/insertData")
    public int insertData(@RequestParam(value = "id") int id,
                          @RequestParam(value = "name") String name) {
        return authNameService.insert(toAuthName(id, name));
    }

    /**
     * Inserts a record through {@link AuthNameService#insertSelective(AuthName)}.
     *
     * @param id   value of the {@code id} request parameter
     * @param name value of the {@code name} request parameter
     * @return the value returned by the service
     */
    @PostMapping("/insertSelectData")
    public int insertSelectiveData(@RequestParam(value = "id") int id,
                                   @RequestParam(value = "name") String name) {
        return authNameService.insertSelective(toAuthName(id, name));
    }

    /** Builds the entity from the two request parameters. */
    private static AuthName toAuthName(int id, String name) {
        return new AuthName(id, name);
    }
}
WebMagic代码
process抓取页面
抓取auth的id和name,并存入List集合中,编写getAuthList()方法
/**
 * WebMagic {@link PageProcessor} that queues the post links found on a page and
 * records one {@link AuthName} per processed page in an in-memory list.
 *
 * <p>The id of each record is the value of the static {@link #count} when the page
 * is processed; {@code count} is incremented once per page.
 */
public class MyProcessor implements PageProcessor {

    /** XPath selecting the links that are queued for crawling. */
    private static final String POST_LINKS_XPATH =
            "//*[@id=\"post_list\"]/div/div[@class='post_item_body']/h3/a/@href";

    /** XPath selecting the text of the header title element. */
    private static final String HEADER_TITLE_XPATH = "//*[@id=\"Header1_HeaderTitle\"]/text()";

    // Crawl settings: 3 retries, 100 ms sleep between requests.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /** Number of pages processed so far; also used as the id of the next record. */
    public static int count = 0;

    /** Records collected so far, in processing order. */
    List<AuthName> list = new ArrayList<>();

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Queues the post links of the page, stores the header title as the
     * {@code authname} result field, appends an {@link AuthName} built from the
     * current {@link #count} and the header title, prints a summary line and
     * finally increments {@link #count}.
     *
     * @param page page handed over by the spider
     */
    @Override
    public void process(Page page) {
        AuthName record = new AuthName();

        // Queue every matching link for crawling.
        page.addTargetRequests(page.getHtml().xpath(POST_LINKS_XPATH).all());

        // Expose the header title to the pipelines.
        page.putField("authname", page.getHtml().xpath(HEADER_TITLE_XPATH));

        record.setId(count);
        record.setName(page.getHtml().xpath(HEADER_TITLE_XPATH).get());
        list.add(record);

        System.out.println(summarize(page));
        count++;
    }

    /** Builds the console line printed for each processed page. */
    private String summarize(Page page) {
        return "抓取的内容:"
                + page.getHtml().css("#Header1_HeaderTitle", "text").get()
                + page.getHtml().xpath("//*[@id=\"mainContent\"]/div/a/@name").get()
                + page.getHtml().xpath("//*[@id=\"home\"]/div[@id=\"footer\"]/text()").get();
    }

    /**
     * Returns the records collected so far.
     *
     * @return the internal list itself, not a copy
     */
    public List<AuthName> getAuthList() {
        return this.list;
    }
}
WebMagic启动代码
spiderStart为启动方法,insertData调用mybatis存入mysql
/**
 * Runs the crawl with {@link MyProcessor} and stores the collected records
 * through {@link AuthNameService}.
 *
 * <p>The Spring-managed instance publishes itself in the static
 * {@link #webMagicAuth} field after injection, so that an instance created with
 * {@code new} (which gets no injection) can still reach the service.
 */
@Component
public class WebMagicAuth {

    MyProcessor myProcessor = new MyProcessor();

    @Autowired
    AuthNameService authNameService;

    /** The Spring-managed instance, set in {@link #init()}. */
    public static WebMagicAuth webMagicAuth;

    /** Publishes the Spring-managed instance once its dependencies are injected. */
    @PostConstruct
    public void init() {
        webMagicAuth = this;
    }

    /**
     * Crawls starting from the cnblogs home page with a single thread, blocks
     * until the spider finishes, then prints the elapsed time and page count.
     */
    public void spiderStart() {
        System.out.println("开始爬取...");
        long startTime = System.currentTimeMillis();
        Spider.create(myProcessor).addUrl("https://www.cnblogs.com/")
                .addPipeline(new ConsolePipeline())
                .thread(1)
                .run();
        long endTime = System.currentTimeMillis();
        System.out.println("爬取结束,耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了" + MyProcessor.count + "条记录");

        // Debug output of the third record; guarded because a short crawl may
        // produce fewer than three records.
        List<AuthName> crawled = myProcessor.getAuthList();
        if (crawled.size() > 2) {
            System.out.println("asd" + crawled.get(2).getName());
        }
    }

    /**
     * Runs the crawl and inserts every collected record that has a non-null id.
     *
     * @throws IllegalStateException if a record must be inserted but no
     *         {@link AuthNameService} is available (Spring context not started)
     */
    public void insertData() {
        spiderStart();
        for (AuthName authName : myProcessor.getAuthList()) {
            System.out.println(authName.toString());
            if (authName.getId() != null) {
                resolveService().insertSelective(authName);
            }
        }
    }

    /** Returns the injected service, falling back to the one held by the Spring-managed instance. */
    private AuthNameService resolveService() {
        if (authNameService != null) {
            return authNameService;
        }
        if (webMagicAuth != null && webMagicAuth.authNameService != null) {
            return webMagicAuth.authNameService;
        }
        throw new IllegalStateException(
                "AuthNameService is not available: WebMagicAuth must be used after the Spring context has started");
    }
}
遇见错误
mybatis There is no getter for property named ‘name’ in ‘class com.jatham.pojo.AuthName’
出现这行报错时,只需要检查XXMapper.xml文件:如果字段名是数据库保留字,就用反引号(`,即键盘Esc键下方的按键)把它括起来。
如果调用authNameService时出现空指针异常(NullPointerException),意为无法自动装载service
添加代码
// Service injected by Spring into the managed WebMagicAuth bean.
@Autowired
AuthNameService authNameService;

// Holds the Spring-managed instance so that code running outside the container
// can reach the injected service through it.
public static WebMagicAuth webMagicAuth;

/**
 * Publishes the managed instance after injection. The injected service is
 * reachable as {@code webMagicAuth.authNameService}; no extra field copy is
 * needed because {@code webMagicAuth} is this very instance.
 */
@PostConstruct
public void init() {
    webMagicAuth = this;
}
最后主方法调用
/**
 * Application entry point: starts the Spring context, then runs the crawl and
 * stores the results.
 */
@SpringBootApplication
@MapperScan("com.jatham.mapper")
public class BootWebmagicApplication {

    /**
     * Starts Spring Boot and triggers {@code insertData()} on the
     * Spring-managed {@link WebMagicAuth} bean, so its {@code @Autowired}
     * service is populated (an instance created with {@code new} gets no injection).
     *
     * @param args command-line arguments passed on to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(BootWebmagicApplication.class, args)
                .getBean(WebMagicAuth.class)
                .insertData();
    }
}
最终效果
mybatis逆向工程在生成实体类时,setter中会出现this.name = name == null ? null : name.trim();
这是为了去除首尾空格并处理null,但是我们在抓取的时候会出现null,然后报错,所以把它舍去,改为this.name = name
附录
docker容器化部署
1.maven封装jar包
2.编写dockerfile
3.docker build -t image-name .
4.docker run
参考链接
来源:CSDN
作者:Jatham_C
链接:https://blog.csdn.net/qq_42784606/article/details/104629517