跳至主要内容

Python 2.7 web crawler

#coding=utf-8
#python 2.x
import re
import urllib
def getHtml(url):
    """Fetch *url* and return the raw response body.

    Uses the Python 2 ``urllib.urlopen`` API, so the result is whatever
    ``read()`` yields from the opened resource (an undecoded byte string
    under Python 2); no charset decoding is attempted here.

    :param url: address to open, passed unchanged to ``urllib.urlopen``.
    :returns: the complete body read from the response.
    :raises IOError: propagated from ``urllib.urlopen`` when the
        connection cannot be made.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the underlying connection even if read() raises.
        page.close()

# Download the Jianshu front page as raw HTML.
html = getHtml("http://www.jianshu.com")

# Two non-greedy capture groups: the link's href and the anchor text of
# every <h4 class="title"> entry on the page.
reg = r'<h4 class="title"><a target="_blank" href="(.*?)">(.*?)</a>'
hotre = re.compile(reg)

# With two groups, findall() returns a list of (href, text) tuples.
artlist = hotre.findall(html)

# Print every captured field on its own line: href first, then the text.
for article in artlist:
    for com in article:
        # Parenthesised single-argument form prints the same under the
        # Python 2 print statement.
        print(com)

评论