BookReptile.exe是一款用go語言寫的某書屋爬蟲,有代碼,有成品,大家笑納,今天寫了一個書屋的爬蟲,獻給愛讀書的朋友們,該網站有兩種下載頁,一個城通,一個百度云,城通沒有密碼,百度云含密碼,才學go語言,所以沒有去重功能,每次會重新新建txt文檔,之前爬好的數據,注意保存哦.
主要功能
爬取書名+下載網(wǎng)址+密碼
使用說明
這是單線程版本。如需多線程,
在循環中直接使用 go working(i) 即可,
再加入 channel 通知主 go 程退出即可。
軟件代碼
package main
import (
"fmt"
"io"
"net/http"
"os"
"regexp"
"strconv"
"strings"
)
// count is the running sequence number written in front of each book entry.
// working increments it once per book title it matches, so numbering
// continues across pages within a single run.
var count int
// main prompts for an inclusive page range on stdin, truncates BookList.txt,
// and then crawls every page in that range in order.
//
// Invalid input (non-numeric text, a start page below 1, or an end page below
// the start page) is reported and the program exits without crawling.
func main() {
	var start, end int

	fmt.Print("起始頁(>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("fmt.Scan err", err)
		return
	}
	fmt.Print("終止頁(>=起始頁[適度爬取,太多小心IP被封哦]):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("fmt.Scan err", err)
		return
	}
	if start < 1 || end < start {
		fmt.Println("invalid page range:", start, end)
		return
	}

	// Create (or truncate) the output file up front so each run starts from
	// an empty list; the handle itself is not needed here.
	fc, err := os.Create("BookList.txt")
	if err != nil {
		fmt.Println("os.Create err", err)
		return
	}
	if err := fc.Close(); err != nil {
		fmt.Println("fc.Close err", err)
		return
	}

	// Visit every listing page in the requested range.
	for i := start; i <= end; i++ {
		working(i)
	}
	fmt.Println("爬取完畢,馬上閃開!!!")
}
// working crawls one listing page and appends every book found on it to
// BookList.txt.
//
// idx is the listing page number. For each book ID on the page the book's
// download page is fetched; the title is then written together with either
// the Baidu download link and each extraction key found, or each Chengtong
// (CT) link found. Failures are reported on stdout and abort the rest of the
// page; nothing is returned.
func working(idx int) {
	// The os package requires one of O_RDONLY/O_WRONLY/O_RDWR. With O_APPEND
	// alone the access mode is read-only on Unix-like systems and the writes
	// below fail, so request write access explicitly. O_CREATE keeps the
	// function usable even if the file does not exist yet.
	fo, err := os.OpenFile("BookList.txt", os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0644)
	if err != nil {
		fmt.Println("os.OpenFile err", err)
		return
	}
	defer fo.Close()

	pageURL := "https://www.bukebook.cn/page/" + strconv.Itoa(idx)
	result, err := httpGet(pageURL, idx)
	if err != nil {
		fmt.Println("檢查網絡,或者IP被封了...", err)
		return
	}

	// Patterns used to scrape the listing page and the download pages.
	// NOTE(review): the Chinese literals below are in Traditional script;
	// confirm they match the markup the site actually serves.
	bookIDRule := `class="greatwp-fp04-post-title"><a href="https://www.bukebook.cn/([0-9]+).html" rel="bookmark">`
	bookNameRule := `.html">《(?s:(.*?))</a></h2>`
	// The original literal contained a stray "(wǎng)" conversion artifact,
	// which made the pattern require that text and therefore never match.
	CTUrlRule := `<a class="ordown-button" href="(?s:(.*?))" target="_blank">城通網盤</a>`
	psdRule := `<strong>提取秘鑰: </strong>(?s:(.*?)) </br>`

	for i, tmpID := range regexpData(result, bookIDRule) {
		bookID := tmpID[1]

		// Fetch the download page for this book.
		dlURL := "https://www.bukebook.cn/wp-content/plugins/ordown/down.php?id=" + bookID
		dlResult, err := httpGet(dlURL, i)
		if err != nil {
			fmt.Println("dl httpGet err", err)
			return
		}

		// Extract the title, the Chengtong links and the Baidu keys.
		allBookName := regexpData(dlResult, bookNameRule)
		allCTUrl := regexpData(dlResult, CTUrlRule)
		allPsd := regexpData(dlResult, psdRule)
		// The drive type depends only on the page, not on the title.
		isBaidu := strings.Contains(dlResult, "百度云盤")

		for _, tmpBookName := range allBookName {
			bookName := tmpBookName[1]
			count++
			header := strconv.Itoa(count) + ".《" + bookName + "\n"

			if isBaidu {
				// Baidu: the link is derived from the book ID, and one line
				// is written per extraction key found on the page.
				BDUrl := "https://www.bukebook.cn/wp-content/plugins/ordown/download1.php?id=" + bookID
				for _, tmpPsd := range allPsd {
					if _, err := fo.WriteString(header + BDUrl + " " + tmpPsd[1] + "\n"); err != nil {
						fmt.Println("fo.Write err", err)
						return
					}
					fmt.Println("《" + bookName + " 完成\n")
				}
				continue
			}

			// Chengtong: one line per link found on the page, no key needed.
			for _, tmpCTUrl := range allCTUrl {
				if _, err := fo.WriteString(header + tmpCTUrl[1] + "\n"); err != nil {
					fmt.Println("fo.Write err", err)
					return
				}
				fmt.Println("《" + bookName + " 完成\n")
			}
		}
	}
}
// regexpData compiles rule and returns every match of it in data. Each
// element holds the full match followed by its capture groups; the result is
// nil when nothing matches. It panics if rule is not a valid pattern.
func regexpData(data, rule string) [][]string {
	return regexp.MustCompile(rule).FindAllStringSubmatch(data, -1)
}
// httpGet issues an HTTP GET for url and returns the whole response body as
// a string.
//
// idx is not used by the function; it is kept so existing call sites still
// compile. An error is returned when the request fails, when the server
// answers with a non-2xx status, or when reading the body fails part-way;
// result is empty whenever err is non-nil.
func httpGet(url string, idx int) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// An error page (blocked IP, missing page, ...) must not be parsed as if
	// it were real content.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return "", fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
	}

	// io.Copy surfaces every read error other than io.EOF, including one
	// that arrives together with zero bytes, and the builder avoids
	// re-copying the accumulated body on every chunk.
	var body strings.Builder
	if _, err := io.Copy(&body, resp.Body); err != nil {
		return "", fmt.Errorf("reading body of %s: %w", url, err)
	}
	return body.String(), nil
}