package main
import (
"github.com/PuerkitoBio/goquery"
"github.com/hu17889/go_spider/core/common/page"
"github.com/hu17889/go_spider/core/pipeline"
"github.com/hu17889/go_spider/core/spider"
)
// MyPageProcesser is the page processor handed to the spider in main.
// It has no fields; its behavior lives in the Process and Finish methods.
type MyPageProcesser struct{}
// NewMyPageProcesser returns a pointer to a freshly allocated, zero-valued
// MyPageProcesser.
func NewMyPageProcesser() *MyPageProcesser {
	return new(MyPageProcesser)
}
// Process walks the parsed HTML of p and prints the text of every div whose
// class attribute is exactly "flex-middle" and that sits inside a td.
// Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html.
//
// Matches are only written with the builtin println; nothing is stored on the
// Page, so pipelines receive no fields from this processor.
func (*MyPageProcesser) Process(p *page.Page) {
	// Defensive guard (mirrors the go_spider README example): on a failed
	// download the parsed document may be unset, and querying it would panic.
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}

	doc := p.GetHtmlParser()
	matches := doc.Find("td div[class='flex-middle']")
	matches.Each(func(_ int, sel *goquery.Selection) {
		println(sel.Text())
	})
}
// Finish is intentionally a no-op: this processor holds no state that needs
// flushing or releasing.
func (*MyPageProcesser) Finish() {}
// main builds a spider named "TaskName" around MyPageProcesser, seeds it with
// a single start URL, attaches the console pipeline and runs it.
func main() {
	crawler := spider.NewSpider(NewMyPageProcesser(), "TaskName")

	// Start URL; "html" is the response type ("html" or "json").
	crawler = crawler.AddUrl("http://101.200.54.63/", "html")

	// Print results on screen.
	crawler = crawler.AddPipeline(pipeline.NewPipelineConsole())

	// Crawl requests with three coroutines.
	crawler = crawler.SetThreadnum(3)

	crawler.Run()
}
import (
"github.com/PuerkitoBio/goquery"
"github.com/hu17889/go_spider/core/common/page"
"github.com/hu17889/go_spider/core/pipeline"
"github.com/hu17889/go_spider/core/spider"
)
// MyPageProcesser is the page processor handed to the spider in main.
// It has no fields; its behavior lives in the Process and Finish methods.
type MyPageProcesser struct{}
// NewMyPageProcesser returns a pointer to a freshly allocated, zero-valued
// MyPageProcesser.
func NewMyPageProcesser() *MyPageProcesser {
	return new(MyPageProcesser)
}
// Process walks the parsed HTML of p and prints the text of every div whose
// class attribute is exactly "flex-middle" and that sits inside a td.
// Package goquery (http://godoc.org/github.com/PuerkitoBio/goquery) is used to parse html.
//
// Matches are only written with the builtin println; nothing is stored on the
// Page, so pipelines receive no fields from this processor.
func (*MyPageProcesser) Process(p *page.Page) {
	// Defensive guard (mirrors the go_spider README example): on a failed
	// download the parsed document may be unset, and querying it would panic.
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}

	doc := p.GetHtmlParser()
	matches := doc.Find("td div[class='flex-middle']")
	matches.Each(func(_ int, sel *goquery.Selection) {
		println(sel.Text())
	})
}
// Finish is intentionally a no-op: this processor holds no state that needs
// flushing or releasing.
func (*MyPageProcesser) Finish() {}
// main builds a spider named "TaskName" around MyPageProcesser, seeds it with
// a single start URL, attaches the console pipeline and runs it.
func main() {
	crawler := spider.NewSpider(NewMyPageProcesser(), "TaskName")

	// Start URL; "html" is the response type ("html" or "json").
	crawler = crawler.AddUrl("http://101.200.54.63/", "html")

	// Print results on screen.
	crawler = crawler.AddPipeline(pipeline.NewPipelineConsole())

	// Crawl requests with three coroutines.
	crawler = crawler.SetThreadnum(3)

	crawler.Run()
}
评论
发表评论