A while back I needed to scrape some information from web pages. Having no prior experience with crawlers, I read up on WebMagic and wrote a simple one.
1. First, a quick introduction to WebMagic:
WebMagic uses a fully modular design that covers the whole crawler lifecycle (link extraction, page downloading, content extraction, persistence). It supports multi-threaded and distributed crawling, automatic retries, and custom UA/cookie settings.
Design concept: a Spider coordinates four pluggable components, Downloader, PageProcessor, Scheduler, and Pipeline, each owning one stage of the lifecycle above. (The architecture diagram from the original post is not reproduced here.)
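To make the modular design concrete, below is a minimal sketch of how the pieces plug into a Spider. MyPageProcessor and the seed URL are placeholders; the Downloader, Scheduler, and Pipeline shown are the defaults that ship with webmagic-core, listed explicitly only to show where each module slots in.

import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.scheduler.QueueScheduler;

public class CrawlerBootstrap {
    public static void main(String[] args) {
        Spider.create(new MyPageProcessor())             // content extraction (placeholder class)
              .setDownloader(new HttpClientDownloader()) // page downloading
              .setScheduler(new QueueScheduler())        // link/URL management
              .addPipeline(new ConsolePipeline())        // persistence (here: print to console)
              .addUrl("https://example.com")             // placeholder seed URL
              .thread(5)                                 // 5 worker threads
              .run();
    }
}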
Maven dependencies:
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>

(The original listed webmagic-extension twice; the entry with the slf4j-log4j12 exclusion is the one to keep, since the exclusion avoids clashing with the logging binding Spring Boot already provides.)
JDBC mode, a plain DAO that writes each crawled post to MySQL. The snippet below restores the driver-loading call that its ClassNotFoundException catch implies; the masked credentials stay masked:
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;

public class CsdnBlogDao {

    private Connection conn = null;
    private Statement stmt = null;

    public CsdnBlogDao() {
        try {
            Class.forName("com.mysql.jdbc.Driver"); // load the MySQL driver
            String url = "jdbc:mysql://localhost:3306/test?"
                    + "user=***&password=***&useUnicode=true&characterEncoding=utf8";
            conn = DriverManager.getConnection(url);
            stmt = conn.createStatement();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    public int add(CsdnBlog csdnBlog) {
        try {
            String sql = "insert into `test`.`csdnblog` (`keyes`, `titles`, `content`, `dates`, `tags`, "
                    + "`category`, `views`, `comments`, `copyright`) values (?, ?, ?, ?, ?, ?, ?, ?, ?)";
            PreparedStatement ps = conn.prepareStatement(sql);
            ps.setInt(1, csdnBlog.getKey());
            ps.setString(2, csdnBlog.getTitle());
            ps.setString(3, csdnBlog.getContent());
            ps.setString(4, csdnBlog.getDates());
            ps.setString(5, csdnBlog.getTags());
            ps.setString(6, csdnBlog.getCategory());
            ps.setInt(7, csdnBlog.getView());
            ps.setInt(8, csdnBlog.getComments());
            ps.setInt(9, csdnBlog.getCopyright());
            return ps.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return -1;
    }
}
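The DAO assumes a csdnblog table already exists in the test schema. The original post never shows the DDL; a sketch matching the column list in the INSERT might look like the following, where all column types are assumptions:

CREATE TABLE `test`.`csdnblog` (
  `id`        INT NOT NULL AUTO_INCREMENT,  -- surrogate key (assumed)
  `keyes`     INT,                          -- post ID parsed from the URL
  `titles`    VARCHAR(255),
  `content`   TEXT,
  `dates`     VARCHAR(32),
  `tags`      VARCHAR(255),
  `category`  VARCHAR(255),
  `views`     INT,
  `comments`  INT,
  `copyright` INT,                          -- 1 = original post, 0 = repost
  PRIMARY KEY (`id`)
) DEFAULT CHARSET = utf8;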
Entity class:
public class CsdnBlog {

    private int key;          // post ID
    private String title;     // title
    private String dates;     // post date
    private String tags;      // tags
    private String category;  // categories
    private int view;         // view count
    private int comments;     // comment count
    private int copyright;    // 1 = original post, 0 = repost
    private String content;   // body text

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public int getKey() {
        return key;
    }

    public void setKey(int key) {
        this.key = key;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getDates() {
        return dates;
    }

    public void setDates(String dates) {
        this.dates = dates;
    }

    public String getTags() {
        return tags;
    }

    public void setTags(String tags) {
        this.tags = tags;
    }

    public String getCategory() {
        return category;
    }

    public void setCategory(String category) {
        this.category = category;
    }

    public int getView() {
        return view;
    }

    public void setView(int view) {
        this.view = view;
    }

    public int getComments() {
        return comments;
    }

    public void setComments(int comments) {
        this.comments = comments;
    }

    public int getCopyright() {
        return copyright;
    }

    public void setCopyright(int copyright) {
        this.copyright = copyright;
    }

    @Override
    public String toString() {
        return "CsdnBlog [key=" + key + ", title=" + title + ", content=" + content + ", dates=" + dates
                + ", tags=" + tags + ", category=" + category + ", view=" + view + ", comments=" + comments
                + ", copyright=" + copyright + "]";
    }
}
Startup class. The PageProcessor below drives the whole crawl; the backslashes in its regular expressions were lost in the original formatting and are restored here:
import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class CsdnBlogPageProcessor implements PageProcessor {

    private static String username = "chenyufeng1991"; // CSDN user whose blog is crawled
    private static int size = 0; // total number of articles crawled

    // site-level crawl configuration: encoding, crawl interval, retry count, etc.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public Site getSite() {
        return site;
    }

    // process() is the core extraction hook where the crawl logic lives
    public void process(Page page) {
        // list page
        if (!page.getUrl().regex("http://blog\\.csdn\\.net/" + username + "/article/details/\\d+").match()) {
            // queue every article page found in the article list area
            page.addTargetRequests(page.getHtml().xpath("//div[@id='article_list']").links()
                    .regex("/" + username + "/article/details/\\d+")
                    .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/") // use replace to turn relative URLs into absolute ones
                    .all());
            // queue the other list pages
            page.addTargetRequests(page.getHtml().xpath("//div[@id='papelist']").links()
                    .regex("/" + username + "/article/list/\\d+")
                    .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/")
                    .all());
        // article page
        } else {
            size++; // one more article
            // collect the extracted fields in a CsdnBlog so they can go straight to the database
            CsdnBlog csdnBlog = new CsdnBlog();
            // ID, parsed from the article URL
            csdnBlog.setKey(Integer.parseInt(
                    page.getUrl().regex("http://blog\\.csdn\\.net/" + username + "/article/details/(\\d+)").get()));
            // title
            csdnBlog.setTitle(
                    page.getHtml().xpath("//div[@class='article_title']//span[@class='link_title']/a/text()").get());
            // body text
            csdnBlog.setContent(
                    page.getHtml().xpath("//div[@class='article_content']/allText()").get());
            // post date
            csdnBlog.setDates(
                    page.getHtml().xpath("//div[@class='article_r']/span[@class='link_postdate']/text()").get());
            // tags (possibly several, joined with commas)
            csdnBlog.setTags(listToString(page.getHtml()
                    .xpath("//div[@class='article_l']/span[@class='link_categories']/a/allText()").all()));
            // categories (possibly several, joined with commas)
            csdnBlog.setCategory(listToString(
                    page.getHtml().xpath("//div[@class='category_r']/label/span/text()").all()));
            // view count
            csdnBlog.setView(Integer.parseInt(page.getHtml()
                    .xpath("//div[@class='article_r']/span[@class='link_view']").regex("(\\d+)人阅读").get()));
            // comment count
            csdnBlog.setComments(Integer.parseInt(page.getHtml()
                    .xpath("//div[@class='article_r']/span[@class='link_comments']").regex("\\((\\d+)\\)").get()));
            // original post or not ("bog_copyright" is the CSS class CSDN used at the time)
            csdnBlog.setCopyright(page.getHtml().regex("bog_copyright").match() ? 1 : 0);
            // write the record to the database
            new CsdnBlogDao().add(csdnBlog);
            // and echo it to the console
            System.out.println(csdnBlog);
        }
    }

    // join a list into a comma-separated string
    public static String listToString(List<String> stringList) {
        if (stringList == null) {
            return null;
        }
        StringBuilder result = new StringBuilder();
        boolean flag = false;
        for (String string : stringList) {
            if (flag) {
                result.append(",");
            } else {
                flag = true;
            }
            result.append(string);
        }
        return result.toString();
    }

    public static void main(String[] args) {
        long startTime, endTime;
        System.out.println("[crawler started]...");
        startTime = System.currentTimeMillis();
        // start from the user's blog home page with 5 threads
        Spider.create(new CsdnBlogPageProcessor()).addUrl("http://blog.csdn.net/" + username).thread(5).run();
        endTime = System.currentTimeMillis();
        System.out.println("[crawler finished] crawled " + size + " articles in about "
                + ((endTime - startTime) / 1000) + " seconds; all saved to the database.");
    }
}
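A side note: calling the DAO directly inside process() works, but WebMagic's own extension point for persistence is the Pipeline interface mentioned at the top. A minimal sketch of what moving the write into a pipeline could look like (CsdnBlogPipeline is a name I'm assuming; it reuses the DAO and entity above):

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

public class CsdnBlogPipeline implements Pipeline {

    private final CsdnBlogDao dao = new CsdnBlogDao();

    @Override
    public void process(ResultItems resultItems, Task task) {
        // the PageProcessor would call page.putField("blog", csdnBlog) instead of writing to the DB itself
        CsdnBlog blog = resultItems.get("blog");
        if (blog != null) {
            dao.add(blog);
        }
    }
}

It would be registered with Spider.create(new CsdnBlogPageProcessor()).addPipeline(new CsdnBlogPipeline()), which keeps extraction and persistence in separate modules, exactly the modular split WebMagic is designed around.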
Spring Boot + MySQL (MyBatis) version. Here the PageProcessor doubles as the Spring Boot entry point, and persistence goes through Spring-managed services instead of raw JDBC:
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.ConfigurableApplicationContext;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

@SpringBootApplication // assumed: SpringApplication.run() below needs this class to be the boot entry point
public class GamePageProcessor implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(GamePageProcessor.class);

    private static DianjingService d;
    private static BannerService bs;
    private static SportService ss;
    private static YuleNewsService ys;
    private static UpdateService ud;

    // site-level crawl configuration: encoding, crawl interval, retry count, etc.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        ConfigurableApplicationContext context = SpringApplication.run(GamePageProcessor.class, args);
        // static fields can't be @Autowired, so the bean is pulled from the context by hand
        d = context.getBean(DianjingService.class);
        // Spider.create(new GamePageProcessor()).addUrl("网址").thread(5).run();
    }

    // process() is the core extraction hook where the crawl logic lives
    public void process(Page page) {
        Selectable url = page.getUrl();
        if (url.toString().equals("网址")) { // "网址" is the target URL, masked in the original post
            DianjingVideo dv = new DianjingVideo();
            // titles
            List<String> ls = page.getHtml()
                    .xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-title']/a/text()").all();
            // hrefs from the <a> tags
            List<String> ls1 = page.getHtml().xpath("//div[@class='v']/div[@class='v-link']/a/@href").all();
            // dates
            List<String> ls2 = page.getHtml()
                    .xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-entry']/div[@class='v-meta-data']/span[@class='r']/text()").all();
            // thumbnails
            List<String> ls3 = page.getHtml().xpath("//div[@class='v']/div[@class='v-thumb']/img/@src").all();
            // save the first five entries; the same dv instance is refilled and persisted on each pass
            for (int i = 0; i < 5; i++) {
                dv.setTitles(ls.get(i));
                dv.setCategory("");
                dv.setDates(ls2.get(i));
                dv.setHrefs(ls1.get(i));
                dv.setPhoto(ls3.get(i));
                dv.setSources("");
                d.addVideo(dv);
            }
        }
    }
}
Controller:
@Controller
@RequestMapping(value = "/dianjing")
public class DianjingController {

    @Autowired
    private DianjingService s;

    /*
     * mobile games
     */
    @RequestMapping("/dianjing")
    @ResponseBody
    public Object dianjing() {
        List<Dianjing> list = s.find2();
        JSONObject jo = new JSONObject();
        if (list != null) {
            jo.put("code", 0);
            jo.put("success", true);
            jo.put("count", list.size());
            jo.put("list", list);
        }
        return jo;
    }
}
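For reference, on a successful lookup the endpoint returns JSON along these lines (the values are illustrative):

{ "code": 0, "success": true, "count": 2, "list": [ ...serialized Dianjing records... ] }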
The entity class is not shown here.
DAO layer:
@insert ( "insert into dianjing (titles,dates,category,hrefs,photo,sources) values(#{titles},#{dates},#{category},#{hrefs},#{photo},#{sources})" ) int adddj(dianjing dj); |
That is the whole Spring Boot + WebMagic crawler walkthrough, covering both the plain JDBC and the MySQL/MyBatis approaches. I hope it gives you a useful starting point.
Original post: https://www.cnblogs.com/NCL--/p/8608336.html