安装Goose
Python-Goose项目地址:https://github.com/grangier/python-goose
git clone https://github.com/grangier/python-goose.git
cd python-goose
python setup.py install
提取煎蛋网专题标题:
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# author: Robinn
from goose import Goose
from bs4 import BeautifulSoup
class SourceTitle:
    """Collect link titles from a series of numbered listing pages.

    Each page is fetched through Goose from ``url + str(page)``; the raw
    HTML is parsed with BeautifulSoup and every element carrying the CSS
    class ``title2`` contributes the text of its first ``<a>`` descendant.
    """

    def __init__(self, url, pagenum):
        """Remember where to crawl.

        Args:
            url: URL prefix; each page identifier is appended to it as a
                string to form the address that is fetched.
            pagenum: Iterable of page identifiers to visit (despite the
                name it is iterated over, not used as a count).
        """
        self.url = url
        self.pagenum = pagenum

    def spider(self):
        """Fetch every configured page and return the titles found.

        Returns:
            A list with the link text of each ``title2`` element, in the
            order the pages were visited and the elements appear on them.
        """
        titleList = []
        # The extractor does not depend on the page being fetched, so build
        # it once instead of once per iteration.
        g = Goose()
        for i in self.pagenum:
            article = g.extract(url=self.url + str(i))
            soup = BeautifulSoup(article.raw_html, "html.parser")
            for line in soup.find_all(class_="title2"):
                link = line.a
                # A title2 element without an <a> descendant yields None
                # here; skip it rather than abort the whole crawl with an
                # AttributeError.
                if link is None:
                    continue
                titleList.append(link.text)
        return titleList
if __name__ == "__main__":
    # Crawl pages 1 through 10 of the tag listing and print one title per line.
    base_url = "http://jandan.net/tag/%E8%B5%B0%E8%BF%9B%E7%A7%91%E5%AD%A6/page/"
    crawler = SourceTitle(base_url, range(1, 11))
    for title in crawler.spider():
        print(title)
|
过滤中文停用词:
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# author: Robinn
from goose import Goose
from goose.text import StopWordsChinese
if __name__ == "__main__":
    # Configure Goose with the Chinese stop-word class, extract the page,
    # and print the first 150 characters of the cleaned text.
    extractor = Goose({"stopwords_class":StopWordsChinese})
    target = "http://jandan.net/ooxx/page-1#comments"
    preview = extractor.extract(url=target).cleaned_text[:150]
    print(preview)
|
更多关于Goose的API使用请参考github项目源码:https://github.com/grangier/python-goose