
Java Course Project: Web Crawler

Published: 2020-09-17 00:47:55

1. Looking for a web crawler example (Java)

Heritrix is a good open-source crawler: http://crawler.archive.org/

2. Need code and a workflow for writing a web crawler in Java (urgent)

import java.awt.*;
import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import javax.swing.*;
import javax.swing.table.*;

// A web crawler (here "crawling" means the same as fetching or capturing pages)
public class SearchCrawler extends JFrame{
    // Options for the maximum number of URLs to crawl
    private static final String[] MAX_URLS={"50","100","500","1000"};

    // Cache of robots.txt disallow lists
    private HashMap disallowListCache=new HashMap();

    // Search GUI controls
    private JTextField startTextField;
    private JComboBox maxComboBox;
    private JCheckBox limitCheckBox;
    private JTextField logTextField;
    private JTextField searchTextField;
    private JCheckBox caseCheckBox;
    private JButton searchButton;

    // Search-status GUI controls
    private JLabel crawlingLabel2;
    private JLabel crawledLabel2;
    private JLabel toCrawlLabel2;
    private JProgressBar progressBar;
    private JLabel matchesLabel2;

    // Table listing the search matches
    private JTable table;

    // Flag indicating whether the crawler is currently crawling
    private boolean crawling;

    // Writer for the match log file
    private PrintWriter logFileWriter;

    // Constructor of the search crawler
    public SearchCrawler(){
        // Set the application title bar
        setTitle("Search Crawler");
        // Set the window size
        setSize(600,600);

        // Handle the window-closing event
        addWindowListener(new WindowAdapter(){
            public void windowClosing(WindowEvent e){
                actionExit();
            }
        });

        // Set up the File menu
        JMenuBar menuBar=new JMenuBar();
        JMenu fileMenu=new JMenu("File");
        fileMenu.setMnemonic(KeyEvent.VK_F);
        JMenuItem fileExitMenuItem=new JMenuItem("Exit",KeyEvent.VK_X);
        fileExitMenuItem.addActionListener(new ActionListener(){
            public void actionPerformed(ActionEvent e){
                actionExit();
            }
        });
        fileMenu.add(fileExitMenuItem);
        menuBar.add(fileMenu);
        setJMenuBar(menuBar);
        // (the listing breaks off here in the original answer)

3. How do you implement a web crawler in Java?

A web crawler is a program that automatically fetches web pages; it downloads pages from the World Wide Web for a search engine and is one of its key components.
A traditional crawler starts from the URLs of one or more seed pages, collects the URLs found on those pages, and, while fetching, keeps extracting new URLs from the current page into a queue until some stop condition is met. For vertical search, a focused crawler, i.e., one that targets pages on a specific topic, is a better fit.

Below is the core code of a simple crawler implemented in Java:
public void crawl() throws Throwable {
    while (continueCrawling()) {
        CrawlerUrl url = getNextUrl(); // take the next URL from the crawl queue
        if (url != null) {
            printCrawlInfo();
            String content = getContent(url); // fetch the text content of the URL

            // A focused crawler only keeps pages relevant to its topic; here a simple regex match is used
            if (isContentRelevant(content, this.regexpSearchPattern)) {
                saveContent(url, content); // save the page to local storage

                // Extract the links found in the page and add them to the crawl queue
                Collection urlStrings = extractUrls(content, url);
                addUrlsToUrlQueue(url, urlStrings);
            } else {
                System.out.println(url + " is not relevant, ignoring ...");
            }

            // Pause between requests to avoid being blocked by the target site
            Thread.sleep(this.delayBetweenUrls);
        }
    }
    closeOutputStream();
}
private CrawlerUrl getNextUrl() throws Throwable {
    CrawlerUrl nextUrl = null;
    while ((nextUrl == null) && (!urlQueue.isEmpty())) {
        CrawlerUrl crawlerUrl = this.urlQueue.remove();
        // doWeHavePermissionToVisit: are we allowed to visit this URL? A polite crawler follows the rules in the site's robots.txt
        // isUrlAlreadyVisited: has the URL been visited already? Large search engines usually deduplicate with a BloomFilter; a HashMap is enough here
        // isDepthAcceptable: has the depth limit been reached? Crawlers usually work breadth-first; some sites build crawler traps (auto-generated dead-end links that lock a crawler into an endless loop), which a depth limit avoids
        if (doWeHavePermissionToVisit(crawlerUrl)
                && (!isUrlAlreadyVisited(crawlerUrl))
                && isDepthAcceptable(crawlerUrl)) {
            nextUrl = crawlerUrl;
            // System.out.println("Next url to be visited is " + nextUrl);
        }
    }
    return nextUrl;
}
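The three checks above are only called, not defined, in this listing. A minimal sketch of what the deduplication, depth-limit and visited-marking helpers might look like, assuming hypothetical visitedUrls and maxDepth fields and a getDepth() accessor on CrawlerUrl (none of these names come from the original answer):

// Hypothetical fields; names are illustrative, not part of the original listing.
private final Map<String, CrawlerUrl> visitedUrls = new HashMap<>();
private final int maxDepth = 5;

private boolean isUrlAlreadyVisited(CrawlerUrl url) {
    // A production search engine would use a Bloom filter; a HashMap is fine for a course project.
    return visitedUrls.containsKey(url.getUrlString());
}

private boolean isDepthAcceptable(CrawlerUrl url) {
    // Stop following links once they are nested deeper than the configured limit;
    // this also protects against crawler traps.
    return url.getDepth() <= maxDepth;
}

private void markUrlAsVisited(CrawlerUrl url) {
    visitedUrls.put(url.getUrlString(), url);
}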
private String getContent(CrawlerUrl url) throws Throwable {
    // The HttpClient 4.1 API differs from earlier versions
    HttpClient client = new DefaultHttpClient();
    HttpGet httpGet = new HttpGet(url.getUrlString());
    StringBuffer strBuf = new StringBuffer();
    HttpResponse response = client.execute(httpGet);
    if (HttpStatus.SC_OK == response.getStatusLine().getStatusCode()) {
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(entity.getContent(), "UTF-8"));
            String line = null;
            if (entity.getContentLength() > 0) {
                strBuf = new StringBuffer((int) entity.getContentLength());
                while ((line = reader.readLine()) != null) {
                    strBuf.append(line);
                }
            }
        }
        if (entity != null) {
            entity.consumeContent();
        }
    }
    // mark the URL as visited
    markUrlAsVisited(url);
    return strBuf.toString();
}
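For reference only (not part of the original answer): DefaultHttpClient has since been deprecated, and the same fetch can be done with the JDK's built-in java.net.http client (Java 11+). A rough sketch, using fully qualified names so it does not clash with the Apache HttpClient imports used above:

private String getContentModern(CrawlerUrl url) throws Exception {
    java.net.http.HttpClient client = java.net.http.HttpClient.newHttpClient();
    java.net.http.HttpRequest request =
            java.net.http.HttpRequest.newBuilder(java.net.URI.create(url.getUrlString())).GET().build();
    java.net.http.HttpResponse<String> response =
            client.send(request, java.net.http.HttpResponse.BodyHandlers.ofString());
    markUrlAsVisited(url);
    // Return the page body only on HTTP 200, mirroring the HttpStatus.SC_OK check above.
    return response.statusCode() == 200 ? response.body() : "";
}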
public static boolean isContentRelevant(String content,
        Pattern regexpPattern) {
    boolean retValue = false;
    if (content != null) {
        // does the page text match the regular expression?
        Matcher m = regexpPattern.matcher(content.toLowerCase());
        retValue = m.find();
    }
    return retValue;
}
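A short usage sketch for the relevance check above (the HTML string is a placeholder):

// Compile the topic pattern once and reuse it for every fetched page.
Pattern topic = Pattern.compile("java");
boolean relevant = isContentRelevant("<html>Learn Java crawlers here</html>", topic); // true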
public List extractUrls(String text, CrawlerUrl crawlerUrl) {
    Map urlMap = new HashMap();
    extractHttpUrls(urlMap, text);
    extractRelativeUrls(urlMap, text, crawlerUrl);
    return new ArrayList(urlMap.keySet());
}
private void extractHttpUrls(Map urlMap, String text) {
    Matcher m = httpRegexp.matcher(text); // httpRegexp: Pattern matching absolute http links (field name assumed; the original expression was garbled)
    while (m.find()) {
        String url = m.group();
        String[] terms = url.split("a href=\"");
        for (String term : terms) {
            // System.out.println("Term = " + term);
            if (term.startsWith("http")) {
                int index = term.indexOf("\"");
                if (index > 0) {
                    term = term.substring(0, index);
                }
                urlMap.put(term, term);
                System.out.println("Hyperlink: " + term);
            }
        }
    }
}
private void extractRelativeUrls(Map urlMap, String text,
        CrawlerUrl crawlerUrl) {
    Matcher m = relativeRegexp.matcher(text);
    URL textURL = crawlerUrl.getURL();
    String host = textURL.getHost();
    while (m.find()) {
        String url = m.group();
        String[] terms = url.split("a href=\"");
        for (String term : terms) {
            if (term.startsWith("/")) {
                int index = term.indexOf("\"");
                if (index > 0) {
                    term = term.substring(0, index);
                }
                String s = "http://" + host + term;
                urlMap.put(s, s);
                System.out.println("Relative url: " + s);
            }
        }
    }
}
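Splitting on a href=" with regular expressions, as the two methods above do, is fragile against real-world HTML. As an alternative (not part of the original answer), link extraction can be done with jsoup, a common Java HTML parser. A sketch assuming jsoup is on the classpath:

import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class LinkExtractorSketch {
    // Returns all absolute link targets found in the given HTML.
    public static List<String> extractLinks(String html, String baseUrl) {
        Document doc = Jsoup.parse(html, baseUrl);   // baseUrl is used to resolve relative hrefs
        List<String> links = new ArrayList<>();
        for (Element a : doc.select("a[href]")) {
            links.add(a.attr("abs:href"));           // absolute form of the href attribute
        }
        return links;
    }
}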
public static void main(String[] args) {
    try {
        String url = "";
        Queue urlQueue = new LinkedList();
        String regexp = "java";
        urlQueue.add(new CrawlerUrl(url, 0));
        NaiveCrawler crawler = new NaiveCrawler(urlQueue, 100, 5, 1000L,
                regexp);
        // boolean allowCrawl = crawler.areWeAllowedToVisit(url);
        // System.out.println("Allowed to crawl: " + url + " " + allowCrawl);
        crawler.crawl();
    } catch (Throwable t) {
        System.out.println(t.toString());
        t.printStackTrace();
    }
}

4. How to write/implement a web crawler in Java that fetches web pages

The principle is to save the cookies returned after logging in and to send them in the request headers every time a page is fetched afterwards. The server identifies the user by the cookie, so having the cookie means having a logged-in session, and all later requests are made as the user that cookie belongs to. Additional note: Java is an object-oriented programming language for writing cross-platform application software. Java technology offers excellent generality, efficiency, portability and security; it is widely used on PCs, in data centers, on game consoles, in scientific supercomputers, on mobile phones and across the Internet, and it has the world's largest professional developer community.
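A minimal sketch of this cookie-reuse pattern using the JDK's built-in java.net.http client (Java 11+); the login URL and form field names below are placeholders, not from the original answer:

import java.net.CookieManager;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class CookieSessionSketch {
    public static void main(String[] args) throws Exception {
        // The CookieManager stores cookies set by the login response and
        // automatically attaches them to every later request.
        CookieManager cookies = new CookieManager();
        HttpClient client = HttpClient.newBuilder().cookieHandler(cookies).build();

        // 1) Log in (placeholder URL and form fields).
        HttpRequest login = HttpRequest.newBuilder(URI.create("https://example.com/login"))
                .header("Content-Type", "application/x-www-form-urlencoded")
                .POST(HttpRequest.BodyPublishers.ofString("user=alice&password=secret"))
                .build();
        client.send(login, HttpResponse.BodyHandlers.discarding());

        // 2) Fetch a protected page; the session cookie is sent automatically.
        HttpRequest page = HttpRequest.newBuilder(URI.create("https://example.com/protected")).build();
        HttpResponse<String> resp = client.send(page, HttpResponse.BodyHandlers.ofString());
        System.out.println(resp.statusCode());
    }
}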

5. Java web crawler

I haven't written a web crawler as such, but here is a program I put together that automatically logs in to Mop and claims the daily check-in award; you can use it as a reference. The required jars are commons-logging.jar, commons-net-1.4.1.jar, commons-codec-1.3.jar, log4j.jar and httpclient-4.3.1.jar. The source code is below; hope it helps.
package com.ly.mainprocess;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.Consts;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.StatusLine;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

public class Test1 {
public static void main(String[] args){
Test1 test1 = new Test1();
System.out.println(test1.process("******","******"));
}

@SuppressWarnings("deprecation")
public boolean process(String username,String password) {
boolean ret=false;
DefaultHttpClient httpclient = new DefaultHttpClient();
try {
HttpGet httpget;
HttpResponse response;
HttpEntity entity;

List<Cookie> cookies;

// Build the login POST request
HttpPost httppost = new HttpPost("http://login.hi.mop.com/Login.do"); // user login
List<NameValuePair> nvps = new ArrayList<NameValuePair>();
nvps.add(new BasicNameValuePair("nickname", username));
nvps.add(new BasicNameValuePair("password", password));
nvps.add(new BasicNameValuePair("origURL", "http://hi.mop.com/SysHome.do"));
nvps.add(new BasicNameValuePair("loginregFrom", "index"));
nvps.add(new BasicNameValuePair("ss", "10101"));

httppost.setEntity(new UrlEncodedFormEntity(nvps, Consts.UTF_8));
httppost.addHeader("Referer", "http://hi.mop.com/SysHome.do");
httppost.addHeader("Connection", "keep-alive");
httppost.addHeader("Content-Type", "application/x-www-form-urlencoded");
httppost.addHeader("Accept-Language", "zh-CN,zh;q=0.8");
httppost.addHeader("Origin", "http://hi.mop.com");
httppost.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36");
response = httpclient.execute(httppost);
entity = response.getEntity();
// System.out.println("Login form get: " + response.getStatusLine());
EntityUtils.consume(entity);

// System.out.println("Post logon cookies:");
cookies = httpclient.getCookieStore().getCookies();
if (cookies.isEmpty()) {
// System.out.println("None");
} else {
for (int i = 0; i < cookies.size(); i++) {
// System.out.println("- " + cookies.get(i).toString());
}
}

// Follow the redirect after login
String url = ""; // redirect target
Header locationHeader = response.getFirstHeader("Location");
// System.out.println(locationHeader.getValue());
if (locationHeader != null) {
url = locationHeader.getValue(); // get the redirect href
HttpGet httpget1 = new HttpGet(url);
response = httpclient.execute(httpget1);
// login succeeded
}
entity = response.getEntity();
// System.out.println(response.getStatusLine());
if (entity != null) {
// System.out.println("Response content length: " + entity.getContentLength());
}
// Print the result
BufferedReader reader = new BufferedReader(new InputStreamReader(entity.getContent(), "UTF-8"));
String line = null;
while ((line = reader.readLine()) != null) {
// System.out.println(line);
}

// Automatic daily check-in
// Visit a sub-page of the site.
HttpPost httppost1 = new HttpPost("http://home.hi.mop.com/ajaxGetContinusLoginAward.do"); // daily check-in award endpoint
httppost1.addHeader("Content-Type", "text/plain;charset=UTF-8");
httppost1.addHeader("Accept", "text/plain, */*");
httppost1.addHeader("X-Requested-With", "XMLHttpRequest");
httppost1.addHeader("Referer", "http://home.hi.mop.com/Home.do");
response = httpclient.execute(httppost1);
entity = response.getEntity();
// System.out.println(response.getStatusLine());
if(response.getStatusLine().toString().indexOf("HTTP/1.1 200 OK")>=0){
ret = true;
}
if (entity != null) {
// System.out.println("Response content length: " + entity.getContentLength());
}
// Print the result
reader = new BufferedReader(new InputStreamReader(entity.getContent(), "UTF-8"));
line = null;
while ((line = reader.readLine()) != null) {
System.out.println(line);
}
} catch (Exception e) {

} finally {
httpclient.getConnectionManager().shutdown();
}
return ret;
}
}

6. Urgently need a Java program template: for our Java data structures course project we have to write a crawler in Java that fetches the content of two web pages.

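A minimal sketch of such a crawler, assuming the two target URLs are passed on the command line (the URLs themselves are placeholders):

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class TwoPageCrawler {
    public static void main(String[] args) throws Exception {
        // Expect exactly two URLs, e.g. "https://example.com/a https://example.com/b".
        if (args.length != 2) {
            System.err.println("Usage: java TwoPageCrawler <url1> <url2>");
            return;
        }
        HttpClient client = HttpClient.newHttpClient();
        for (String url : args) {
            HttpRequest request = HttpRequest.newBuilder(URI.create(url)).GET().build();
            HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
            System.out.println("== " + url + " (" + response.statusCode() + ") ==");
            System.out.println(response.body());
        }
    }
}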

7. Looking for a web crawler program written in Java

Recommended reading: The Art of Java by Herbert Schildt and James Holmes, which contains exactly the kind of example you are looking for. The book provides complete source code that runs.

8. How to implement a web crawler in Java

The answer is the same as for question 3; see the focused-crawler code above.
