springboot+webmagic+mysql

亡梦爱人 提交于 2020-03-03 19:31:16

此工程springboot整合webmagic爬虫框架,用mybatis将爬取的数据存入mysql

pom文件

webmagic依赖

<!-- WebMagic crawler framework: the core artifact plus the extension artifact, same version -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>

mybatis逆向工程依赖

<!-- MyBatis integration for Spring Boot -->
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>2.1.0</version>
        </dependency>
        <!-- MyBatis Generator library (marked as untested by the author).
             The artifact id is mybatis-generator-core, not mybatis-generator. -->
        <dependency>
            <groupId>org.mybatis.generator</groupId>
            <artifactId>mybatis-generator-core</artifactId>
            <version>1.3.5</version>
        </dependency>
        <!-- mybatis-generator-maven-plugin is a build plugin, so it belongs in the
             build/plugins section only and is intentionally not listed here as a
             project dependency. -->
<!-------------------------------------------------------------------------->
<plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
            <plugin>
                <groupId>org.mybatis.generator</groupId>
                <artifactId>mybatis-generator-maven-plugin</artifactId>
                <version>1.3.5</version>
                <!-- Location of the generator configuration file. Forward slashes and
                     ${basedir} keep the path valid on Windows, Linux and macOS alike. -->
                <configuration>
                    <configurationFile>${basedir}/src/main/resources/generatorConfig.xml</configurationFile>
                    <verbose>true</verbose>
                    <overwrite>true</overwrite>
                </configuration>
                <!-- Jars the plugin itself needs on its classpath: the JDBC driver used
                     to read the database metadata. -->
                <dependencies>
                    <dependency>
                        <groupId>mysql</groupId>
                        <artifactId>mysql-connector-java</artifactId>
                        <version>8.0.17</version>
                    </dependency>
                </dependencies>
            </plugin>

完整pom文件

<dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <!-- WebMagic crawler framework -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>

        <!-- MySQL JDBC driver used by the application at runtime -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.15</version>
        </dependency>
        <!-- MyBatis integration for Spring Boot -->
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>2.1.0</version>
        </dependency>
        <!-- MyBatis Generator library (marked as untested by the author).
             The artifact id is mybatis-generator-core, not mybatis-generator. -->
        <dependency>
            <groupId>org.mybatis.generator</groupId>
            <artifactId>mybatis-generator-core</artifactId>
            <version>1.3.5</version>
        </dependency>
        <!-- mybatis-generator-maven-plugin is a build plugin; it is declared once in
             the build section below and is intentionally not a project dependency. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
            <exclusions>
                <exclusion>
                    <groupId>org.junit.vintage</groupId>
                    <artifactId>junit-vintage-engine</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.10</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
            <plugin>
                <groupId>org.mybatis.generator</groupId>
                <artifactId>mybatis-generator-maven-plugin</artifactId>
                <version>1.3.5</version>
                <!-- Location of the generator configuration file. Forward slashes and
                     ${basedir} keep the path valid on Windows, Linux and macOS alike. -->
                <configuration>
                    <configurationFile>${basedir}/src/main/resources/generatorConfig.xml</configurationFile>
                    <verbose>true</verbose>
                    <overwrite>true</overwrite>
                </configuration>
                <!-- Jars the plugin itself needs on its classpath: the JDBC driver used
                     to read the database metadata. -->
                <dependencies>
                    <dependency>
                        <groupId>mysql</groupId>
                        <artifactId>mysql-connector-java</artifactId>
                        <version>8.0.17</version>
                    </dependency>
                </dependencies>
            </plugin>
        </plugins>
    </build>

application.properties

# Port of the embedded web server
server.port=10000
# JDBC driver class (MySQL Connector/J 8.x). The generic key is used instead of the
# Tomcat-pool-specific spring.datasource.tomcat.* key so that it applies to
# whichever connection pool is actually in use.
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver

# Data source and MyBatis settings
spring.datasource.url=jdbc:mysql://IP:3306/web_magic?useUnicode=true&characterEncoding=UTF-8&transformedBitIsBoolean=true&autoReconnect=true&failOverReadOnly=false&allowMultiQueries=true&useSSL=false
spring.datasource.username=root
spring.datasource.password=password
#spring.datasource.tomcat.default-auto-commit=true
mybatis.type-aliases-package=com.jatham.pojo
mybatis.mapper-locations=classpath:mapper/*.xml

# NOTE(review): the spring.datasource.dbcp2.* keys below presumably only take effect
# when Commons DBCP2 is the active connection pool; the pom shown with this project
# does not declare it - confirm which pool is used.

# Minimum number of idle connections kept in the pool
spring.datasource.dbcp2.min-idle=5

# Maximum number of idle connections kept in the pool
spring.datasource.dbcp2.max-idle=5

# Number of connections created when the pool starts
spring.datasource.dbcp2.initial-size=5

# Maximum time, in milliseconds, to wait for a connection from the pool
spring.datasource.dbcp2.max-wait-millis=5000

mybatis逆向工程配置文件 generatorConfig.xml

需要修改的只有JDBC参数和数据库的表名;mapper的生成路径尽量不要修改,否则会出现SqlSession无法注入的问题。

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE generatorConfiguration
        PUBLIC "-//mybatis.org//DTD MyBatis Generator Configuration 1.0//EN"
        "http://mybatis.org/dtd/mybatis-generator-config_1_0.dtd">

<!-- MyBatis Generator configuration: reads the auth_name table and generates the
     model class, the mapper XML and the mapper interface. -->
<generatorConfiguration>
    <context id="testTables" targetRuntime="MyBatis3">
        <commentGenerator>
            <!-- Whether to suppress the auto-generated comments: true = yes, false = no -->
            <property name="suppressAllComments" value="true" />
        </commentGenerator>
        <!-- Database connection: driver class, URL, user and password.
             com.mysql.cj.jdbc.Driver is the driver class of MySQL Connector/J 8.x
             (the version declared for the generator plugin); the older
             com.mysql.jdbc.Driver name is deprecated there. -->
        <jdbcConnection driverClass="com.mysql.cj.jdbc.Driver"
                        connectionURL="jdbc:mysql://IP:3306/web_magic"
                        userId="root"
                        password="password">
            <property name="nullCatalogMeansCurrent" value="true"/>
        </jdbcConnection>

        <!-- forceBigDecimals=false (the default): JDBC DECIMAL and NUMERIC columns are
             mapped to an integral Java type where possible; true: they are always
             mapped to java.math.BigDecimal -->
        <javaTypeResolver>
            <property name="forceBigDecimals" value="false" />
        </javaTypeResolver>

        <!-- targetProject / targetPackage: where the model (PO) classes are generated -->
        <javaModelGenerator targetPackage="com.jatham.pojo"
                            targetProject="src/main/java">
            <!-- enableSubPackages: whether to append the schema name to the package -->
            <property name="enableSubPackages" value="false" />
            <!-- trimStrings: generated setters trim leading/trailing whitespace of String values -->
            <property name="trimStrings" value="true" />
        </javaModelGenerator>
        <!-- targetProject / targetPackage: where the mapper XML files are generated -->
        <sqlMapGenerator targetPackage="mapper"
                         targetProject="src/main/resources">
            <!-- enableSubPackages: whether to append the schema name to the package -->
            <property name="enableSubPackages" value="false" />
        </sqlMapGenerator>
        <!-- targetPackage: where the mapper interfaces are generated -->
        <javaClientGenerator type="XMLMAPPER"
                             targetPackage="com.jatham.mapper"
                             targetProject="src/main/java">
            <!-- enableSubPackages: whether to append the schema name to the package -->
            <property name="enableSubPackages" value="false" />
        </javaClientGenerator>

        <!-- Table to generate code for; the *ByExample methods are switched off -->
        <table tableName="auth_name" schema="" enableCountByExample="false"
               enableDeleteByExample="false" enableUpdateByExample="false"
               enableSelectByExample="false" selectByExampleQueryId="false">
        </table>


        <!-- Template for tables whose columns need an explicit Java type:
         <table schema="" tableName="">
            <columnOverride column="" javaType="" />
        </table> -->
    </context>
</generatorConfiguration>

mybatis流程

逆向工程会创建XXXMapper.xml、XXXMapper接口(.java)和XXX实体类(.java)

编写service,controller测试

/**
 * Service facade over {@code AuthNameMapper}; every method forwards its argument
 * to the mapper method of the same name and returns the mapper's result.
 */
@Service
public class AuthNameService {

    /** Mapper injected by Spring. */
    @Autowired
    private AuthNameMapper authNameMapper;

    /**
     * Looks up a record by its primary key.
     *
     * @param id primary key to look up
     * @return the value returned by the mapper for {@code id}
     */
    public AuthName selectByPrimaryKey(Integer id) {
        final AuthName found = authNameMapper.selectByPrimaryKey(id);
        return found;
    }

    /**
     * Forwards {@code record} to the mapper's {@code insert}.
     *
     * @param record record to insert
     * @return the int returned by the mapper
     */
    public int insert(AuthName record) {
        final int result = authNameMapper.insert(record);
        return result;
    }

    /**
     * Forwards {@code record} to the mapper's {@code insertSelective}.
     *
     * @param record record to insert
     * @return the int returned by the mapper
     */
    public int insertSelective(AuthName record) {
        final int result = authNameMapper.insertSelective(record);
        return result;
    }
}
/**
 * REST endpoints used to test the {@code AuthName} persistence path by hand.
 *
 * <p>The class is a {@code @RestController}, so every handler's return value is
 * already written to the response body; no per-method {@code @ResponseBody} is needed.
 */
@RestController
public class AuthNameController {

    @Autowired
    private AuthNameService authNameService;

    /**
     * GET /getid - returns the record the service finds for the given id.
     *
     * @param id request parameter {@code id}
     * @return the record returned by the service
     */
    @GetMapping("/getid")
    public AuthName getById(@RequestParam(value = "id") int id) {
        return authNameService.selectByPrimaryKey(id);
    }

    /**
     * POST /insertData - builds an {@code AuthName} from the request parameters and
     * passes it to the service's {@code insert}.
     *
     * @param id   request parameter {@code id}
     * @param name request parameter {@code name}
     * @return the int returned by the service
     */
    @PostMapping("/insertData")
    public int insertData(@RequestParam(value = "id") int id,
                          @RequestParam(value = "name") String name) {
        return authNameService.insert(new AuthName(id, name));
    }

    /**
     * POST /insertSelectData - builds an {@code AuthName} from the request parameters
     * and passes it to the service's {@code insertSelective}.
     *
     * @param id   request parameter {@code id}
     * @param name request parameter {@code name}
     * @return the int returned by the service
     */
    @PostMapping("/insertSelectData")
    public int insertSelectiveData(@RequestParam(value = "id") int id,
                                   @RequestParam(value = "name") String name) {
        return authNameService.insertSelective(new AuthName(id, name));
    }
}

WebMagic代码

process抓取页面
抓取auth的id和name,并存入List集合中,编写getAuthList()方法

/**
 * WebMagic page processor that queues the post links found on a page and records
 * one {@code AuthName} (running counter as id, header title as name) per processed page.
 */
public class MyProcessor implements PageProcessor {

    /** Number of pages processed so far; its current value is the id of the next record. */
    public static int count = 0;

    // Crawl settings: retry a failed request 3 times, sleep 100 ms between requests
    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    // One entry is appended for every page handed to process()
    List<AuthName> list = new ArrayList<>();

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Handles one downloaded page: queues further links, publishes the header title
     * under the key "authname", stores a record in the list and prints a summary line.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        AuthName record = new AuthName();

        // Queue every post link of the post list for crawling.
        // (A URL-format filter for http://www.cnblogs.com/<user>/p/<7 digits>.html
        // was sketched by the author but is not active.)
        List<String> postLinks = page.getHtml()
                .xpath("//*[@id=\"post_list\"]/div/div[@class='post_item_body']/h3/a/@href")
                .all();
        page.addTargetRequests(postLinks);

        // Hand the header-title selection to the pipelines.
        page.putField("authname", page.getHtml().xpath("//*[@id=\"Header1_HeaderTitle\"]/text()"));

        // Remember the title together with the current counter value.
        String headerTitle = page.getHtml().xpath("//*[@id=\"Header1_HeaderTitle\"]/text()").get();
        record.setId(count);
        record.setName(headerTitle);
        list.add(record);

        // Print what was extracted from this page.
        String titleText = page.getHtml().css("#Header1_HeaderTitle", "text").get();
        String anchorName = page.getHtml().xpath("//*[@id=\"mainContent\"]/div/a/@name").get();
        String footerText = page.getHtml().xpath("//*[@id=\"home\"]/div[@id=\"footer\"]/text()").get();
        System.out.println("抓取的内容:" + titleText + anchorName + footerText);

        count++;
    }

    /**
     * @return the list holding the records collected so far (the live list, not a copy)
     */
    public List<AuthName> getAuthList() {
        return this.list;
    }
}

WebMagic启动代码

spiderStart为启动方法,insertData调用mybatis存入mysql

/**
 * Runs the crawler and stores the collected {@code AuthName} records through
 * {@code AuthNameService}.
 *
 * <p>The Spring-managed instance publishes itself in the static field
 * {@link #webMagicAuth} so that an instance created with {@code new} (which gets no
 * injection) can still reach the injected service.
 */
@Component
public class WebMagicAuth {
    MyProcessor myProcessor = new MyProcessor();

    @Autowired
    AuthNameService authNameService;

    public static WebMagicAuth webMagicAuth;

    /** Publishes the Spring-managed instance (and with it the injected service). */
    @PostConstruct
    public void init() {
        webMagicAuth = this;
    }

    /**
     * Crawls https://www.cnblogs.com/ with a single thread, blocking until the spider
     * stops, then prints the elapsed time and the number of processed pages.
     */
    public void spiderStart() {
        System.out.println("开始爬取...");
        long startTime = System.currentTimeMillis();
        Spider.create(myProcessor).addUrl("https://www.cnblogs.com/")
                .addPipeline(new ConsolePipeline())
                .thread(1)
                .run();
        long endTime = System.currentTimeMillis();
        System.out.println("爬取结束,耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了" + MyProcessor.count + "条记录");

        // Debug output of the third record; guarded because a crawl may yield fewer
        // than three records (the unguarded get(2) threw IndexOutOfBoundsException).
        List<AuthName> crawled = myProcessor.getAuthList();
        if (crawled.size() > 2) {
            System.out.println("asd" + crawled.get(2).getName());
        }
    }

    /**
     * Runs the crawl and inserts every collected record that has a non-null id.
     *
     * @throws IllegalStateException if no injected {@code AuthNameService} is available
     */
    public void insertData() {
        spiderStart();
        AuthNameService service = resolveService();
        for (AuthName authName : myProcessor.getAuthList()) {
            System.out.println(authName.toString());
            if (authName.getId() != null) {
                service.insertSelective(authName);
            }
        }
    }

    /** Returns this instance's injected service, else the one held by the static instance. */
    private AuthNameService resolveService() {
        if (authNameService != null) {
            return authNameService;
        }
        if (webMagicAuth != null && webMagicAuth.authNameService != null) {
            return webMagicAuth.authNameService;
        }
        throw new IllegalStateException(
                "AuthNameService is not available: the Spring context has not initialised WebMagicAuth");
    }
}

遇见错误

mybatis There is no getter for property named 'name' in 'class com.jatham.pojo.AuthName'
出现这个报错时,只需要检查XXXMapper.xml文件:如果列名是数据库保留字,就用反引号(Esc键下方的 ` 键)把它括起来。
在这里插入图片描述
在这里插入图片描述
意为无法自动装载service
添加代码

// Service field marked for injection by Spring.
@Autowired
    AuthNameService authNameService;

    // Static reference to the instance on which init() ran.
    public static  WebMagicAuth webMagicAuth;
    // Runs after injection (@PostConstruct) and publishes this instance statically.
    @PostConstruct
    public void init() {
        webMagicAuth = this;
        // webMagicAuth already refers to this object, so this assignment changes nothing.
        webMagicAuth.authNameService = this.authNameService;
    }

最后主方法调用

/**
 * Application entry point: starts the Spring context, then runs the crawl-and-store job.
 */
@SpringBootApplication
@MapperScan("com.jatham.mapper")
public class BootWebmagicApplication {

    /**
     * Boots Spring and triggers {@code WebMagicAuth.insertData()}.
     *
     * @param args command-line arguments passed through to Spring Boot
     */
    public static void main(String[] args) {
        // Use the bean managed by Spring rather than "new WebMagicAuth()": only the
        // managed bean has its AuthNameService injected.
        SpringApplication.run(BootWebmagicApplication.class, args)
                .getBean(WebMagicAuth.class)
                .insertData();
    }
}

最终效果

mybatis逆向工程在生成实体类时,setter中会出现 this.name = name == null ? null : name.trim();
这段代码的作用是在值不为null时去除首尾空格。但是我们在抓取的时候会出现null,然后报错,所以把它舍去,改为 this.name = name;

在这里插入图片描述在这里插入图片描述在这里插入图片描述

附录

docker容器化部署
1.maven封装jar包
2.编写dockerfile
3.docker build -t image-name .
4.docker run
参考链接

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!