#Python爬虫#--Scrapy入门基本应用

user

雨橙

中国.四川.成都

世界之上、唯有远见、惟爱不变。


安装Scrapy
pip install scrapy

创建项目
scrapy startproject demo1

编写一个爬虫
#!/usr/bin/python
# _*_coding:utf-8 _*_
# author: robinn

import scrapy

class BologsSpider(scrapy.Spider):
    """Download a fixed list of cnblogs posts and save each raw page
    body to a local file named after the last segment of its URL."""
    name = "blogs"
    allowed_domains = ["cnblogs.com"]
    start_urls = [
        "https://www.cnblogs.com/fightccc/p/8616068.html",
        "https://www.cnblogs.com/taoxu/p/8602334.html"
    ]

    def parse(self, response):
        # Use the final path segment (e.g. "8616068.html") as the file name.
        page_name = response.url.rsplit("/", 1)[-1]
        # response.body is bytes, so the file is opened in binary mode.
        with open(page_name, "wb") as out:
            out.write(response.body)
 
运行爬取
scrapy crawl blogs

 
启动终端(可以先安装IPython)
scrapy shell <url>
 
IPython下可以直接输出响应的URL内容、响应头、选择器
response
response.body
response.headers
response.selector
response.selector.xpath("//title")
response.xpath("//title")
 
selector有四个基本的方法:
xpath(): 传入xpath表达式,返回该表达式所对应的所有节点的selector list列表。
css(): 传入CSS表达式,返回该表达式所对应的所有节点的selector list列表。
extract(): 序列化该节点为unicode字符串并返回list。
re(): 根据传入的正则表达式对数据进行提取,返回unicode字符串list列表。
 
 
以博客园为例:
提取文章标题Xpath
response.xpath("//*[@id='news_list']//div[@class='news_block']//div[@class='content']//h2[@class='news_entry']//a//text()").extract()
 
提取文章摘要Xpath
response.xpath("//*[@id='news_list']//div[@class='news_block']//div[@class='content']//div[@class='entry_summary']//text()").extract()


编写一个爬虫提取博客园标题和摘要。分别写入文件和输出到控制台
#!/usr/bin/python
# _*_coding:utf-8 _*_
# author: robinn

import scrapy
from .. items import TestscrapydemoItem

class BologsSpider(scrapy.Spider):
    """Scrape article titles and summaries from the cnblogs news front
    page; write each pair to blogs.txt, echo it to the console, and
    yield one TestscrapydemoItem per entry."""
    name = "blogs"
    allowed_domains = ["cnblogs.com"]
    start_urls = [
        "https://news.cnblogs.com/"
    ]

    def parse(self, response):
        # Open the output in text mode with an explicit UTF-8 encoding.
        # The original used the Python-2-only reload(sys) /
        # sys.setdefaultencoding('utf-8') hack, which raises NameError
        # on Python 3 and masked encoding bugs even on Python 2.
        with open("blogs.txt", "w", encoding="utf-8") as f:
            for sel in response.xpath("//*[@id='news_list']/div[@class='news_block']/div[@class='content']"):
                titles = sel.xpath("h2[@class='news_entry']/a/text()").extract()
                summaries = sel.xpath("div[@class='entry_summary']/text()").extract()

                # Guard against malformed entries: the original indexed
                # title[0] and contents[1] unconditionally and raised
                # IndexError whenever a node had fewer text children.
                if not titles or not summaries:
                    continue

                title = titles[0].strip()
                # Join every summary text node rather than hard-coding
                # index 1, which depended on an incidental leading
                # whitespace text node in the page markup.
                contents = "".join(summaries).strip()

                f.write(title + "\n")
                f.write(contents + "\n")
                f.write("\n")

                print(title)
                print(contents)

                item = TestscrapydemoItem()
                item["title"] = title
                item["contents"] = contents
                yield item
 
 
将爬取输出到Json数据
scrapy crawl blogs -o blogs.json

posted at