您好,欢迎访问三七文档
/**
 * Web crawler source code: a small multi-threaded crawler.
 *
 * <p>Worker threads share one URL queue. Each worker takes a URL, downloads the page, extracts
 * the absolute links found in it and queues every link that has not been seen before. The crawl
 * ends when the queue is empty and every worker is idle.
 *
 * <p>All access to the queue, the seen-URL map and the active-worker counter happens while
 * holding this object's monitor.
 */
public class Spider implements Runnable {

    private static final java.util.logging.Logger logger =
            java.util.logging.Logger.getLogger(Spider.class.getName());

    /** Number of worker threads used by {@link #Spider(String)}. */
    private static final int DEFAULT_THREADS = 10;

    /** A URL whose recorded level is greater than this is dequeued but never downloaded. */
    private static final int MAX_LEVEL = 2;

    /** Connect and read timeout for a single page download, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Matches {@code href} attributes holding an absolute ({@code scheme://...}) URL. */
    private static final java.util.regex.Pattern ABSOLUTE_HREF = java.util.regex.Pattern.compile(
            "href\\s*=\\s*[\"']([a-zA-Z][a-zA-Z0-9+.-]*://[^\"'#\\s>]+)",
            java.util.regex.Pattern.CASE_INSENSITIVE);

    /** URL list: URLs waiting to be crawled, in FIFO order. */
    private final ArrayList<String> urls;

    /** URLs that have already been seen, mapped to the level recorded for them. */
    private final HashMap<String, Integer> indexedURLs;

    /** Configured number of worker threads; never modified after construction. */
    private final int threads;

    /**
     * Workers that are currently neither waiting for work nor finished. Kept separate from
     * {@link #threads} so that {@link #go()} always starts exactly {@code threads} workers.
     */
    private int activeWorkers;

    /**
     * Command-line entry point.
     *
     * @param argv {@code argv[0]} is the URL at which the crawl starts
     * @throws Exception if the crawl is interrupted
     */
    public static void main(String argv[]) throws Exception {
        // Check the length first: indexing an empty array would throw before any null test.
        if (argv.length == 0 || argv[0] == null) {
            System.out.println("Missing required argument: [Site URL]");
            return;
        }
        Spider spider = new Spider(argv[0]);
        spider.go();
    }

    /**
     * Creates a crawler that starts at {@code strURL} and uses the default of 10 threads.
     *
     * @param strURL the start URL
     * @throws IllegalArgumentException if {@code strURL} is {@code null} or blank
     */
    public Spider(String strURL) {
        this(strURL, DEFAULT_THREADS);
    }

    /**
     * Creates a crawler that starts at {@code strURL} and uses {@code threads} worker threads.
     *
     * @param strURL the start URL
     * @param threads number of worker threads, at least 1
     * @throws IllegalArgumentException if {@code strURL} is {@code null} or blank, or if
     *     {@code threads} is less than 1
     */
    public Spider(String strURL, int threads) {
        if (strURL == null || strURL.trim().isEmpty()) {
            throw new IllegalArgumentException("Missing required argument: -u [start url]");
        }
        if (threads < 1) {
            throw new IllegalArgumentException("Invalid number of threads: " + threads);
        }
        this.urls = new ArrayList<>();
        this.indexedURLs = new HashMap<>();
        this.threads = threads;
        this.activeWorkers = threads;
        this.urls.add(strURL);
    }

    /**
     * Runs the crawl: starts the worker threads, waits for all of them to finish and logs the
     * elapsed time.
     *
     * @throws Exception if the calling thread is interrupted while waiting for the workers
     */
    public void go() throws Exception {
        long start = System.currentTimeMillis();
        synchronized (this) {
            // Every worker counts as active before it starts, so the counter cannot reach
            // zero until all of them have gone idle.
            activeWorkers = threads;
        }
        java.util.List<Thread> workers = new ArrayList<>(threads);
        for (int i = 0; i < threads; i++) {
            Thread worker = new Thread(this, "Spider" + (i + 1));
            workers.add(worker);
            worker.start();
        }
        for (Thread worker : workers) {
            worker.join();
        }
        long elapsed = System.currentTimeMillis() - start;
        logger.info("Crawl finished in " + elapsed + " ms");
    }

    /**
     * Same as {@link #go()}; kept for callers of the original one-argument signature.
     *
     * @param strURL ignored, as in the original; the crawl starts at the constructor's URL
     * @throws Exception if the calling thread is interrupted while waiting for the workers
     */
    public void go(String strURL) throws Exception {
        go();
    }

    /** Worker loop: processes queued URLs until {@link #dequeueURL()} reports the end. */
    @Override
    public void run() {
        try {
            String url;
            while ((url = dequeueURL()) != null) {
                try {
                    indexURL(url);
                } catch (RuntimeException e) {
                    // Keep the worker alive: a worker that died here would stay counted as
                    // active and the remaining workers would wait forever.
                    logger.log(java.util.logging.Level.INFO, "Failed to index " + url, e);
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.log(java.util.logging.Level.INFO, "Worker interrupted", e);
        } catch (Exception e) {
            logger.log(java.util.logging.Level.INFO, e.getMessage(), e);
        }
    }

    /**
     * Checks whether the URL list holds a URL that has not been processed yet and, if so,
     * returns it so that the calling worker can continue. If the list is empty the caller is
     * marked idle and waits, unless it was the last active worker, in which case all waiting
     * workers are woken and {@code null} is returned.
     *
     * <p>Intended to be called by the worker threads; the idle accounting assumes that.
     *
     * @return the next URL to crawl, or {@code null} when the crawl is finished
     * @throws Exception if the thread is interrupted while waiting
     */
    public synchronized String dequeueURL() throws Exception {
        while (true) {
            if (!urls.isEmpty()) {
                return urls.remove(0);
            }
            activeWorkers--;
            if (activeWorkers > 0) {
                wait();
                // Woken by new work or by shutdown; count as active again and re-check.
                activeWorkers++;
            } else {
                notifyAll();
                return null;
            }
        }
    }

    /**
     * Adds a URL together with its level, and wakes up sleeping workers. A URL that is
     * {@code null} or has been seen before is ignored.
     *
     * @param url the URL to queue
     * @param level the level to record for {@code url}
     */
    public synchronized void enqueueURL(String url, int level) {
        // A null entry would be mistaken by workers for the end-of-crawl signal.
        if (url == null || indexedURLs.containsKey(url)) {
            return;
        }
        urls.add(url);
        indexedURLs.put(url, level);
        notifyAll();
    }

    /**
     * Downloads the page at {@code url} and queues the URLs found on it.
     *
     * @param url page link
     */
    private void indexURL(String url) {
        int level = nextLevelFor(url);
        if (level < 0) {
            return;
        }

        String strBody;
        try {
            // Fetch the page content.
            strBody = loadURL(url);
        } catch (Exception e) {
            // Best effort: an unreachable page is skipped, but no longer silently.
            logger.log(java.util.logging.Level.FINE, "Failed to load " + url, e);
            return;
        }
        if (strBody == null) {
            return;
        }

        String[] urlGroups;
        try {
            // Extract all URLs from the page.
            urlGroups = parseURLs(strBody);
        } catch (Exception e) {
            logger.log(java.util.logging.Level.INFO, e.getMessage(), e);
            urlGroups = new String[0];
        }
        for (String link : urlGroups) {
            enqueueURL(link, level);
        }
    }

    /**
     * Determines the level to record for links found on {@code url}. An unseen URL is recorded
     * with level 1 and yields 1; a known URL yields its recorded level plus one.
     *
     * @return the level for the page's links, or -1 if the page's own level exceeds
     *     {@link #MAX_LEVEL} and it must not be downloaded
     */
    private synchronized int nextLevelFor(String url) {
        Integer known = indexedURLs.get(url);
        if (known == null) {
            indexedURLs.put(url, 1);
            return 1;
        }
        if (known > MAX_LEVEL) {
            return -1;
        }
        return known + 1;
    }

    /**
     * Downloads the content at {@code url} and decodes it as UTF-8 text. The original listing
     * called this helper without defining it; this is a minimal standard-library version.
     *
     * @param url absolute URL of the page
     * @return the page content
     * @throws Exception if the URL is malformed or the content cannot be read
     */
    private String loadURL(String url) throws Exception {
        java.net.URLConnection connection = java.net.URI.create(url).toURL().openConnection();
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);
        StringBuilder body = new StringBuilder();
        try (java.io.Reader in = new java.io.InputStreamReader(
                connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8)) {
            char[] buffer = new char[8192];
            int read;
            while ((read = in.read(buffer)) != -1) {
                body.append(buffer, 0, read);
            }
        }
        return body.toString();
    }

    /**
     * Extracts the absolute URLs from the {@code href} attributes in {@code body}. The original
     * listing called this helper without defining it. Relative links are skipped because no
     * base URL is passed in.
     *
     * @param body page content, may be {@code null}
     * @return the links in document order; never {@code null}
     */
    private String[] parseURLs(String body) {
        ArrayList<String> links = new ArrayList<>();
        if (body != null) {
            java.util.regex.Matcher matcher = ABSOLUTE_HREF.matcher(body);
            while (matcher.find()) {
                links.add(matcher.group(1));
            }
        }
        return links.toArray(new String[0]);
    }
}
本文标题:网络爬虫源代码
链接地址:https://www.777doc.com/doc-6862672.html