Java 写的简单网络爬虫


觉得好玩,昨天就研究了一下java爬虫。

在网上搜索了一些样例研究了一下,仿照写了一个简单的爬虫,可以自动爬取某本小说的章节(需要自定义正则表达式)。利用多线程+锁可以爬得更快,也可以同时爬多本书。

目前针对的是起点小说网的正则,利用set和list存需要爬的链接和已经爬过的链接,再用map存某本书的名字,已经爬取的章节数等等,然后写到文件里面。

两个类实现

AllUrl.java

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.text.SimpleDateFormat;
import java.util.*;

public class AllUrl {
    /** Maximum number of chapters to crawl per book. */
    public static int maxDepth = 100;
    /** Number of crawler threads started by CrawlTheWeb.main. */
    public static int maxThread = 3;
    /** FIFO queue of URLs waiting to be crawled (head is index 0). */
    public static List<String> waitUrl = new ArrayList<>();
    /** URLs that have already been crawled successfully. */
    public static Set<String> overUrl = new HashSet<>();
    /** Chapter URL -> 1-based chapter number. */
    public static Map<String, Integer> UrlDepth = new HashMap<>();
    /** Chapter URL -> book title (used in log messages and output file names). */
    public static Map<String, String> bookName = new HashMap<>();
    /** Directory the chapter .txt files are written to. */
    public static String savePath = "E:\\起点book\\";

    // Markers used to locate the chapter text inside the downloaded HTML.
    // NOTE(review): the original literals were destroyed when the post was
    // HTML-stripped; the values below are a plausible reconstruction for
    // qidian.com reader pages — verify against a live page before relying
    // on them.
    private static final String CONTENT_START = "class=\"read-content j_readContent\"";
    private static final String PARAGRAPH_SEP = "</p><p>";
    private static final String CONTENT_END = "</div>";
    private static final String NEXT_LINK_MARK = "<a id=\"j_chapterNext\" href=\"";

    /**
     * Downloads the chapter page at {@code url}, extracts the chapter body,
     * writes it to {@code savePath}, and queues the "next chapter" link.
     * When {@code depth} exceeds {@code maxDepth} the calling worker thread
     * is interrupted so its run loop exits.
     *
     * @param url   chapter page URL
     * @param depth 1-based chapter number of this URL
     */
    public static synchronized void workUrl(String url, int depth) {
        if (depth > maxDepth) {
            System.out.println("《" + bookName.get(url) + "》爬取达到设定的章节数,停止爬取。");
            SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");
            System.out.println(formatter.format(new Date()));
            // Signal the worker that picked up this URL to shut down.
            Thread.currentThread().interrupt();
            return;
        }
        if (overUrl.contains(url)) {
            System.out.println(url + "已经爬取过");
            return;
        }
        try {
            URLConnection conn = new URL(url).openConnection();
            StringBuilder page = new StringBuilder();
            // try-with-resources: the original leaked the reader (and the
            // PrintWriter below) whenever substring() threw mid-loop.
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = br.readLine()) != null) {
                    page.append(line);
                }
            }

            // Queue the next-chapter link before extracting the body.
            int nextStart = page.indexOf(NEXT_LINK_MARK);
            if (nextStart >= 0) {
                nextStart += NEXT_LINK_MARK.length();
                int nextEnd = page.indexOf("\"", nextStart + 1);
                if (nextEnd > nextStart) {
                    // The href is protocol-relative, so prepend "https:".
                    addUrl("https:" + page.substring(nextStart, nextEnd), depth + 1, bookName.get(url));
                }
            }

            int start = page.indexOf(CONTENT_START);
            if (start < 0) {
                // Guard added: the original blindly used -1 offsets and
                // crashed with StringIndexOutOfBoundsException.
                System.out.println(url + "页面格式无法识别,跳过。");
                return;
            }
            start += CONTENT_START.length() + 20; // skip the rest of the opening tag
            int allEnd = page.indexOf(CONTENT_END, start) - 10; // trim trailing markup
            int end = page.indexOf(PARAGRAPH_SEP, start + 1);

            // File name is "<book>第<chapter>章.txt" — chapter number keeps
            // files distinct (the original comment claimed a timestamp was
            // used; it is not).
            File out = new File(savePath + bookName.get(url) + "第" + depth + "章" + ".txt");
            try (PrintWriter pw = new PrintWriter(out, "UTF-8")) {
                // One paragraph per line, stopping at the content end marker.
                // The original loop condition compared substring() to null,
                // which is never true; termination relied on the break.
                while (end != -1 && end < allEnd) {
                    pw.println(page.substring(start, end));
                    start = end + PARAGRAPH_SEP.length();
                    end = page.indexOf(PARAGRAPH_SEP, start + 1);
                }
            }
            overUrl.add(url);
            System.out.println("《" + bookName.get(url) + "》已爬取,共爬取所有小说章节数量" + overUrl.size()
                    + "剩余爬取章节数量:" + waitUrl.size());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Pops and returns the next URL to crawl, or {@code null} when the
     * queue is empty.
     */
    public static synchronized String getUrl() {
        return waitUrl.isEmpty() ? null : waitUrl.remove(0);
    }

    /**
     * Adds a chapter URL to the wait queue unless it was already crawled.
     *
     * @param Url   chapter page URL
     * @param Depth 1-based chapter number
     * @param bName book title the chapter belongs to
     */
    public static synchronized void addUrl(String Url, int Depth, String bName) {
        if (!overUrl.contains(Url)) {
            waitUrl.add(Url);
            UrlDepth.put(Url, Depth);
            bookName.put(Url, bName);
            // TODO(review): workers waiting on CrawlTheWeb.lock are never
            // notified when work arrives; see the timed wait in CrawlTheWeb.
            System.out.println("《" + bookName.get(Url) + "》的章节" + Depth + "已经添加到待爬取队列,目前待爬取队列有" + waitUrl.size() + "个任务。");
        } else {
            System.out.println("《" + bookName.get(Url) + "》的章节" + Depth + "已经爬取过了,不再爬取。");
        }
    }
}
CrawlTheWeb.java
import java.sql.Time;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class CrawlTheWeb extends Thread{
    public static Object lock = new Object();//线程锁 用于使线程进入睡眠,或随机唤醒一个线程
    public void run(){
        while(true){
            if(Thread.currentThread().isInterrupted()){
                System.out.println(Thread.currentThread().getName()+"完成了任务。");
                break;
            }
            if(AllUrl.waitUrl.isEmpty() == false){
                String nextUrl = AllUrl.waitUrl.get(0);
                AllUrl.waitUrl.remove(0);
                AllUrl.workUrl(nextUrl,AllUrl.UrlDepth.get(nextUrl));
                System.out.println(this.getName()+"开始爬取《"+AllUrl.bookName.get(nextUrl)+"》,章节数: "+AllUrl.UrlDepth.get(nextUrl));
            }else{
                synchronized (lock){
                    try {
                        System.out.println("待爬取列表为空,"+this.getName()+"进入等待状态。");
                        lock.wait();
                    }catch (Exception e){
                        e.printStackTrace();
                    }
                }
            }
        }
    }

    public static void main(String[] args) {
        SimpleDateFormat formatter= new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");
        Date date = new Date(System.currentTimeMillis());
        System.out.println(formatter.format(date));
        String strUrl = "https://read.qidian.com/chapter/D-1F0Iq1JGPOVUeyz9PqUQ2/DIfEaAmW-9X6ItTi_ILQ7A2/";//爬取的网页
        AllUrl.addUrl(strUrl,1,"模拟器:开局天牢死囚");
        strUrl = "https://read.qidian.com/chapter/W08HMrSPUHj7X4qr8VpWrA2/8W_pmmniqFvM5j8_3RRvhw2/";//爬取的网页
        AllUrl.addUrl(strUrl,1,"我的属性修行人生");
        strUrl = "https://read.qidian.com/chapter/q2B9dFLoeqU3v1oFI-DX8Q2/dsXQ94IHlUZp4rPq4Fd4KQ2/";//爬取的网页
        AllUrl.addUrl(strUrl,1,"这个武圣超有素质");
        for(int i=0;i){
            new CrawlTheWeb().start();
        }

    }
}

大多数人都推荐实现 Runnable 接口,但目前我还用不着,暂时先继承稍微熟悉一点的 Thread。