For those of us who are neither artsy nor particularly technical, browsing Douban notes in idle moments (or busy ones when you don't feel like working) is always a pleasure. Plenty of people on Douban turn out to be genuinely interesting, like the ones who draw with colored pencils; having a hobby like that must be a real joy.
Enough small talk. Here are the 400 most-liked Douban notes, collected with a Python program; a lot of them struck me as interesting, so I'm sharing the list.
Note: this is a very simple crawler, and its coverage improves the longer it runs, since it can only rank notes it has already visited. The current list is certainly far from complete, and like counts change constantly, so treat this as a sample of relatively popular notes rather than a definitive ranking.
The Douban note links are listed below.
2012-10-07 09:31:11
Note link, liked 402 times
Usage: python douban.py > log.txt

douban.py is as follows (Python 2, with BeautifulSoup 3 and httplib2 installed):

import httplib2
from BeautifulSoup import BeautifulSoup, SoupStrainer
import re
import heapq
import time
from time import strftime
from urllib import FancyURLopener
from random import choice

# A pool of User-Agent strings to pick from.
user_agents = [
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/537.4',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9']

# Request headers, including a logged-in session cookie so like counts render.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'GBK,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'bid="cRcN3O9ZJHU"; dbcl2="2359624:xkxvOaiLLTU"; ck="XxRv"; ll="108288"',
    'Host': 'www.douban.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/537.4'}

# Kept from an earlier experiment with urllib; not used below.
class MyOpener(FancyURLopener, object):
    version = choice(user_agents)

class PageWithLikes():
    def __init__(self, url, numlikes):
        self.url = url
        self.numlikes = numlikes

    def __cmp__(self, other):
        # Inverted comparison: heapq and sort() then put the most-liked notes first.
        return cmp(other.numlikes, self.numlikes)

def dumppopular(populars, qsize):
    # Snapshot the current ranking to populars<N>.txt, most-liked first.
    populars.sort()
    with open("populars" + str(qsize) + ".txt", "w") as f:
        f.write(strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\n")
        for popular in populars:
            f.write(popular.url + "\t" + str(popular.numlikes) + "\n")

def getpopular(populars, url):
    # Fetch one note page, read its like count, and keep it if above 400.
    try:
        time.sleep(3)
        print url
        visitedurls.add(url[:44])
        status, response = http.request(url, headers=headers)
        # Parse only <span> tags; the like count sits in <span class="fav-num">.
        soup = BeautifulSoup(response, parseOnlyThese=SoupStrainer('span'))
        likespan = soup.findAll('span', {"class": "fav-num"})[0]
        numlikes = int(re.search(r'\d+', likespan.contents[1].contents[0]).group())
        if numlikes > 400:
            heapq.heappush(populars, PageWithLikes(url, numlikes))
            qsize = len(populars)
            if qsize % 100 == 0:
                print "qsize: ", qsize
                dumppopular(populars, qsize)
    except Exception, e:
        print e

if __name__ == "__main__":
    url = "http://www.douban.com"
    visitedurls = set()   # dedup on the first 44 characters of each URL
    tovisiturls = [url]   # crawl frontier, used as a double-ended queue
    http = httplib2.Http()
    notepattern = re.compile(r"^http://www.douban.com/note/\d+/?$")
    populars = list()     # heap of PageWithLikes, most-liked on top
    print strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    while len(tovisiturls) > 0 and len(populars) < 1000:
        url = tovisiturls.pop()
        visitedurls.add(url[:44])
        print url
        try:
            status, response = http.request(url, headers=headers)
            time.sleep(3)
            # Parse only <a> tags and sort their targets into the frontier.
            for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
                if link.has_key('href'):
                    href = link['href']
                    if href.startswith("http://www.douban.com") \
                            and not href.startswith("http://www.douban.com/feed") \
                            and href[:44] not in visitedurls:
                        if re.match(notepattern, href):
                            # Note pages: check the like count now, crawl their links later.
                            tovisiturls.append(href)
                            getpopular(populars, href)
                        elif href.endswith("notes/") or href.endswith("notes"):
                            # Note listings go to the tail (popped next).
                            tovisiturls.append(href)
                        else:
                            # Everything else goes to the head (visited last).
                            tovisiturls.insert(0, href)
        except Exception, e:
            print e