跳至主要内容

python 2.7 web crawler

#coding=utf-8
#python 2.x
import re
import urllib
def getHtml(url):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address to open.

    Returns:
        Whatever ``read()`` on the opened response yields (the undecoded
        page content).
    """
    # Python 2 exposes urlopen directly on urllib; Python 3 moved it to
    # urllib.request, so fall back to that when the attribute is missing.
    opener = getattr(urllib, "urlopen", None)
    if opener is None:
        from urllib.request import urlopen as opener
    page = opener(url)
    try:
        return page.read()
    finally:
        # Release the connection even if read() raises.
        page.close()

# Download the front page and print every article link found on it.
html = getHtml("http://www.jianshu.com")

# Two capture groups per match: the link target (href) and the link text.
reg = r'<h4 class="title"><a target="_blank" href="(.*?)">(.*?)</a>'
hotre = re.compile(reg)
artlist = hotre.findall(html)

# Each match is a (href, title) tuple; print both parts on separate lines.
for article in artlist:
    for com in article:
        print(com)

评论

此博客中的热门博文

android hide actionbar

public class MainActivity extends Activity { ActionBar actionBar; //声明ActionBar @Override protected void onCreate( Bundle savedInstanceState) { super .onCreate(savedInstanceState); setContentView( R .layout.activity_main); actionBar = getSupportActionBar(); //得 到ActionBar actionBar.hide(); //隐藏ActionBar } }

go url encoding

func QueryUnescape

func QueryUnescape(s string) (string, error)

QueryUnescape does the inverse transformation of QueryEscape, converting %AB into the byte 0xAB and '+' into ' ' (space). It returns an error if any % is not followed by two hexadecimal digits.