Pyspider应用笔记
安装pyspider
pip install pyspider
启动环境
pyspider
代理配置
'proxy': 'localhost:8080'
忽略HTTPS:
validate_cert = False
实例一(获取博客园标题URL信息):
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Project: 博客园
from pyspider.libs.base_handler import *
class Handler(BaseHandler):
    """pyspider spider: collect page titles and URLs from cnblogs.com."""

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # Entry point, re-queued once a day; cert checking is disabled on purpose.
        self.crawl('https://www.cnblogs.com/', validate_cert=False,
                   callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Follow every pagination link on the front page.
        for link in response.doc('div[class="pager"]>a').items():
            self.crawl(link.attr.href, validate_cert=False,
                       callback=self.list_page)

    @config(priority=1)
    def list_page(self, response):
        # Queue each post found in a list page body.
        for link in response.doc('div[class="post_item"]>div[class="post_item_body"]>h3>a').items():
            self.crawl(link.attr.href, validate_cert=False,
                       callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        # Final record: the page URL plus its <title> text.
        record = {
            "url": response.url,
            "urltitle": response.doc('title').text(),
        }
        return record
实例二(获取豆瓣影评信息):
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Project: 豆瓣影评
from pyspider.libs.base_handler import *
class Handler(BaseHandler):
    """pyspider spider: collect the best movie reviews from movie.douban.com."""

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # Daily entry point; HTTPS cert verification is intentionally off.
        self.crawl('https://movie.douban.com/review/best/', validate_cert=False,
                   callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Walk the paginator links.
        for link in response.doc('div[class="paginator"]>a').items():
            self.crawl(link.attr.href, validate_cert=False,
                       callback=self.list_page)

    @config(priority=1)
    def list_page(self, response):
        # Queue each individual review page.
        for link in response.doc('div[class="main-bd"]>h2>a').items():
            self.crawl(link.attr.href, validate_cert=False,
                       callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        # Result record: URL, review title and review body text.
        record = {
            "url": response.url,
            "title": response.doc('div[class="article"]>h1>span').text(),
            "content": response.doc('div[id="link-report"]>div').text(),
        }
        return record
实例三(获取多新奇网站文章标题URL信息):
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# author: Robinn
# 数据库操作类
from six import itervalues
import MySQLdb
class SQL():
    """Minimal MySQLdb helper used by the pyspider on_result hook.

    Connects on construction; ``self.connection`` records whether the
    connect succeeded so later calls fail soft instead of raising.
    """

    def __init__(self):
        # Hard-coded connection settings, taken verbatim from the notes.
        hosts = '127.0.0.1'
        username = 'root'
        password = '1456t'
        database = 'dxq1'
        charsets = 'utf8'
        self.connection = False
        try:
            self.conn = MySQLdb.connect(host=hosts, user=username,
                                        passwd=password, db=database,
                                        charset=charsets)
            self.cursor = self.conn.cursor()
            # BUG FIX: the cursor method is 'execute', not 'esxecute'.
            self.cursor.execute("set names " + charsets)
            self.connection = True
        except Exception as e:
            # FIX: Python 3 'except ... as' syntax; '\n' (was '/n').
            print("Cannot Connect To Mysql!\n", e)

    def escape(self, string):
        """Coerce *string* to str; identifiers are interpolated via this.

        NOTE(review): this does not actually escape anything — table and
        column names are interpolated into the SQL text unescaped, so they
        must come from trusted code, never from crawled input.
        """
        return '%s' % string

    def insxert(self, tablename=None, **values):
        """Insert one row of *values* into *tablename*.

        The (misspelled) method name is kept so existing callers keep
        working. Returns True on success, False on a query error, and
        None (implicitly) when no connection is available.
        """
        if self.connection:
            tablename = self.escape(tablename)
            if values:
                _keys = ",".join(self.escape(k) for k in values)
                _values = ",".join(['%s', ] * len(values))
                # BUG FIX: the SQL keyword is INSERT — 'insxert into' is
                # invalid SQL and every insert would have failed.
                sql_query = "insert into %s (%s) values (%s)" % (tablename, _keys, _values)
            else:
                sql_query = "replace into %s default values" % tablename
            try:
                if values:
                    # FIX: dict.values() replaces six.itervalues (Python 3).
                    self.cursor.execute(sql_query, list(values.values()))
                else:
                    self.cursor.execute(sql_query)
                self.conn.commit()
                return True
            except Exception as e:
                print("An Error Occured: ", e)
                return False
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Project: 多新奇
from pyspider.database.mysql.mysqldb import SQL
from pyspider.libs.base_handler import *
class Handler(BaseHandler):
    """pyspider spider: crawl duoxinqi.com article titles/URLs into MySQL."""

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # Daily entry point.
        self.crawl('http://www.duoxinqi.com/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Follow the numbered pagination links.
        for each in response.doc('div[id="wp_page_numbers"]>ul>li>a').items():
            self.crawl(each.attr.href, callback=self.list_page)

    @config(priority=1)
    def list_page(self, response):
        # Queue each article link found on a list page.
        for each in response.doc('li[class="post"]>h2>a').items():
            print(each)
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        # Result record: the page URL plus its <title> text.
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }

    def on_result(self, result):
        """Persist each crawl result; skip empty rows or rows with no URL.

        BUG FIX: use .get('url') so a result dict that lacks the 'url'
        key is skipped instead of raising KeyError.
        """
        if not result or not result.get('url'):
            return
        sql = SQL()
        sql.insxert('tb_dxq', **result)