#Python爬虫#--Pyspider应用之信息爬取

user

雨橙

中国.四川.成都

世界之上、唯有远见、惟爱不变。


Pyspider应用笔记
 
安装pyspider
pip install pyspider

启动环境
pyspider

代理配置
'proxy': 'localhost:8080'
 
忽略HTTPS证书验证(抓取HTTPS站点时在 self.crawl 中传入)
validate_cert = False

实例一(获取博客园标题URL信息):
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Project: 博客园

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """Crawl the cnblogs.com front page, follow its pager links, and emit
    each post's URL together with its page title."""

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # Scheduled entry point (once per day); HTTPS certificate
        # validation is disabled for the seed request.
        self.crawl('https://www.cnblogs.com/',
                   callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Queue every pagination link found on the front page.
        pager_links = response.doc('div[class="pager"]>a').items()
        for link in pager_links:
            self.crawl(link.attr.href,
                       callback=self.list_page, validate_cert=False)

    @config(priority=1)
    def list_page(self, response):
        # Queue each post's detail page from the listing.
        for anchor in response.doc('div[class="post_item"]>div[class="post_item_body"]>h3>a').items():
            self.crawl(anchor.attr.href,
                       callback=self.detail_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        # Emit the crawled URL and its <title> text.
        record = {
            "url": response.url,
            "urltitle": response.doc('title').text(),
        }
        return record

实例二(获取豆瓣影评信息):
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Project: 豆瓣影评

from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    """Scrape Douban's best-reviews listing: follow pagination, open each
    review, and emit its URL, title, and body text."""

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # Scheduled entry point (once per day); certificate check skipped.
        self.crawl('https://movie.douban.com/review/best/',
                   callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Queue every pagination link of the listing.
        page_links = response.doc('div[class="paginator"]>a').items()
        for link in page_links:
            self.crawl(link.attr.href,
                       callback=self.list_page, validate_cert=False)

    @config(priority=1)
    def list_page(self, response):
        # Queue each review's detail page.
        for anchor in response.doc('div[class="main-bd"]>h2>a').items():
            self.crawl(anchor.attr.href,
                       callback=self.detail_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        # Extract the review title and its full text.
        record = {
            "url": response.url,
            "title": response.doc('div[class="article"]>h1>span').text(),
            "content": response.doc('div[id="link-report"]>div').text(),
        }
        return record


实例三(获取多新奇网站文章标题URL信息):
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# author: Robinn
# 数据库操作类

from six import itervalues
import MySQLdb

class SQL():
    """Thin MySQL helper used by the pyspider ``on_result`` hook.

    Connects on construction; ``self.connection`` records whether the
    connect succeeded. ``insxert`` writes one row per call (the misspelled
    name is kept because existing callers use it; ``insert`` is an alias).
    """

    def __init__(self):
        # NOTE(review): credentials are hard-coded; move them to
        # configuration before any real deployment.
        hosts    = '127.0.0.1'
        username = 'root'
        password = '1456t'
        database = 'dxq1'
        charsets = 'utf8'

        self.connection = False
        try:
            self.conn = MySQLdb.connect(host=hosts, user=username,
                                        passwd=password, db=database,
                                        charset=charsets)
            self.cursor = self.conn.cursor()
            # Fixed: cursor method is ``execute`` (was misspelled ``esxecute``).
            self.cursor.execute("set names " + charsets)
            self.connection = True
        except Exception as e:
            # Fixed: the escape sequence was written as ``/n`` instead of ``\n``.
            print("Cannot Connect To Mysql!\n%s" % e)

    def escape(self, string):
        # WARNING: this performs NO escaping -- table and column names are
        # interpolated verbatim into the SQL below; only row values go
        # through parameterized placeholders. Never pass untrusted
        # identifiers to ``insxert``.
        return '%s' % string

    def insxert(self, tablename=None, **values):
        """Insert one row of *values* into *tablename*.

        Returns True on success, False on a database error, and None when
        no connection is available.
        """
        if self.connection:
            tablename = self.escape(tablename)
            if values:
                _keys = ",".join(self.escape(k) for k in values)
                _values = ",".join(['%s'] * len(values))
                # Fixed: SQL keyword was misspelled as ``insxert``.
                sql_query = "insert into %s (%s) values (%s)" % (tablename, _keys, _values)
            else:
                sql_query = "replace into %s default values" % tablename
            try:
                if values:
                    # list(values.values()) replaces six.itervalues -- the
                    # result is identical on Python 2 and Python 3.
                    self.cursor.execute(sql_query, list(values.values()))
                else:
                    self.cursor.execute(sql_query)
                self.conn.commit()
                return True
            except Exception as e:
                print("An Error Occurred: %s" % e)
                return False

    # Correctly spelled, backward-compatible alias.
    insert = insxert

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Project: 多新奇

from pyspider.database.mysql.mysqldb import SQL
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """Crawl duoxinqi.com, collect each article's URL and page title, and
    persist every result row to MySQL via the SQL helper."""

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # Scheduled entry point (once per day).
        self.crawl('http://www.duoxinqi.com/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Follow the numbered pagination links.
        for page_link in response.doc('div[id="wp_page_numbers"]>ul>li>a').items():
            self.crawl(page_link.attr.href, callback=self.list_page)

    @config(priority=1)
    def list_page(self, response):
        # Queue each post's detail page; the print is a debug trace.
        for post_link in response.doc('li[class="post"]>h2>a').items():
            print(post_link)
            self.crawl(post_link.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        # Emit the crawled URL and its <title> text.
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }

    def on_result(self, result):
        # Persist every non-empty result into table ``tb_dxq``.
        if not result or not result['url']:
            return
        SQL().insxert('tb_dxq', **result)

posted at