学习java的正则表达式,抓取网页并解析HTML部分内容
- package com.xiaofeng.picup;
- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.util.ArrayList;
- import java.util.HashMap;
- import java.util.List;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
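/**
 * Fetches the title and content of a web page (test version). The URL is
 * entered manually; this could be extended to crawl the full content of a
 * whole page automatically.
 */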
- public class WebContent ...{
- /** *//**
- * 读取一个网页全部内容
- */
- public String getOneHtml(String htmlurl) throws IOException...{
- URL url;
- String temp;
- StringBuffer sb = new StringBuffer();
- try ...{
- url = new URL(htmlurl);
- BufferedReader in = new BufferedReader(new InputStreamReader(url
- .openStream(), "utf-8"));// 读取网页全部内容
- while ((temp = in.readLine()) != null) ...{
- sb.append(temp);
- }
- in.close();
- }catch(MalformedURLException me)...{
- System.out.println("你输入的URL格式有问题!请仔细输入");
- me.getMessage();
- throw me;
- }catch (IOException e) ...{
- e.printStackTrace();
- throw e;
- }
- return sb.toString();
- }
- /** *//**
- *
- * @param s
- * @return 获得网页标题
- */
- public String getTitle(String s) ...{
- String regex;
- String title = "";
- List<String> list = new ArrayList<String>();
- regex = "<title>.*?</title>";
- Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
- Matcher ma = pa.matcher(s);
- while (ma.find()) ...{
- list.add(ma.group());
- }
- for (int i = 0; i < list.size(); i++) ...{
- title = title + list.get(i);
- }
- return outTag(title);
- }
- /** *//**
- *
- * @param s
- * @return 获得链接
- */
- public List<String> getLink(String s) ...{
- String regex;
- List<String> list = new ArrayList<String>();
- regex = "<a[^>]*href=("([^"]*)"|'([^']*)'|([^s>]*))[^>]*>(.*?)</a>";
- Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
- Matcher ma = pa.matcher(s);
- while (ma.find()) ...{
- list.add(ma.group());
- }
- return list;
- }
- /** *//**
- *
- * @param s
- * @return 获得脚本代码
- */
- public List<String> getScript(String s) ...{
- String regex;
- List<String> list = new ArrayList<String>();
- regex = "<script.*?</script>";
- Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
- Matcher ma = pa.matcher(s);
- while (ma.find()) ...{
- list.add(ma.group());
- }
- return list;
- }
- /** *//**
- *
- * @param s
- * @return 获得CSS
- */
- public List<String> getCSS(String s) ...{
- String regex;
- List<String> list = new ArrayList<String>();
- regex = "<style.*?</style>";
- Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
- Matcher ma = pa.matcher(s);
- while (ma.find()) ...{
- list.add(ma.group());
- }
- return list;
- }
- /** *//**
- *
- * @param s
- * @return 去掉标记
- */
- public String outTag(String s) ...{
- return s.replaceAll("<.*?>", "");
- }