Python-002 · caiwei8 · Jul 21, 2020 · Jul 24, 2020 · Jul 24, 2020 · Jul 24, 2020
diff --git a/week01/NOTE.md b/week01/NOTE.md
@@ -1 +1,44 @@
-学习笔记
+学习笔记
+
+w3c标准官方文档：https://www.w3.org/standards/
+网页分为3个部分：结构，表现和行为
+1.结构：定义网页的形状和展现形式，结构用html语言
+2.css主要是把我们的结构和表现形式做了分离
+3.js脚本：定义网页行为
+
+
+当我们使用cookie时，代表着我们带着自己的用户名和密码的验证信息向网页发起请求。如果网页登录成功，也就是说cookie里面就包括了验证信息
+
+文字一般放在<span>标签里面
+链接一般用的<a>标签
+图片一般使用的<img>标签
+
+Scrapy的核心组件，见PPT p22-p24
+
+scrapy的settings.py可以修改爬虫的很多设置
+
+scrapy的选择器：
+//div...:表示从上向下去找，匹配条件可以放任意长的路径的
+./      :表示从你当前位置继续向下找
+../     :表示从你当前的上一级的位置继续向下找
+
+如果想取某个标签部分的属性的时候，要用/@href
+                的内容的时候，要用/text()
+
+scrapy.Request(....,dont_filter=False) dont_filter如果等于True,是用来解除去重功能。Scrapy 自带 url 去重功能，第二次请求之前会将已发送的请求自动进行过滤处理。所以将 dont_filter 设置为 True 起到的作用是解除去重功能，一旦设置成 True，将不会去重，直接发送请求。
+
+yield作为语句 与 return的区别：
+1.yield更灵活，可以一个一个地返回所需要的值
+2.return返回的是对象，yield返回的是单独的一个值，不用去考虑返回的数据类型（视频中老师所讲 yield 返回的是单独的一个值，更准确的说返回的值必须是对象，在此章节我们暂定只把它理解返回一个值。在后面的章节多线程部分，我们会结合课程再对 yield 进行详解）
+1. Scrapy Xpath 官方学习文档： https://docs.scrapy.org/en/latest/topics/selectors.html#working-with-xpaths
+2. Xpath 中文文档：
+https://www.w3school.com.cn/xpath/index.asp
+3. Xpath 英文文档：
+https://www.w3.org/TR/2017/REC-xpath-31-20170321/#nt-bnf
+4. yield 表达式官方文档：
+https://docs.python.org/zh-cn/3.7/reference/expressions.html#yieldexpr
+5. yield 语句官方文档
+https://docs.python.org/zh-cn/3.7/reference/simple_stmts.html#yield
+6. Python 推导式官方文档：
+https://docs.python.org/zh-cn/3.7/tutorial/datastructures.html#list-comprehensions
+
diff --git a/week01/homework2/spiders/movie2.csv b/week01/homework2/spiders/movie2.csv
@@ -0,0 +1,40 @@
+"电影名称:釜山行2：半岛
+电影类型:动作  惊悚
+上映时间:2020-07-15韩国上映
+"
+"电影名称:我在世界城等你
+电影类型:短片
+上映时间:2016-05-16中国大陆上映
+"
+"电影名称:第一次的离别
+电影类型:剧情  家庭
+上映时间:2020-07-20中国大陆上映
+"
+"电影名称:大话西游之大圣娶亲
+电影类型:喜剧  爱情  奇幻  古装
+上映时间:2020-07-24中国大陆重映
+"
+"电影名称:误杀
+电影类型:剧情  犯罪
+上映时间:2020-07-20中国大陆重映
+"
+"电影名称:天气之子
+电影类型:爱情  动画  奇幻
+上映时间:2019-11-01中国大陆上映
+"
+"电影名称:釜山行
+电影类型:动作  惊悚  灾难
+上映时间:2016-07-20韩国上映
+"
+"电影名称:少年的你
+电影类型:爱情  青春  剧情
+上映时间:2019-10-25中国大陆上映
+"
+"电影名称:唐人街探案2
+电影类型:喜剧  动作  悬疑
+上映时间:2018-02-16中国大陆上映
+"
+"电影名称:寻梦环游记
+电影类型:动画  冒险  家庭
+上映时间:2020-07-20中国大陆重映
+"
diff --git a/week01/homework2/spiders/scrapy.cfg b/week01/homework2/spiders/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = spiders.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = spiders
diff --git a/week01/homework2/spiders/spiders/__init__.py b/week01/homework2/spiders/spiders/__init__.py
diff --git a/week01/homework2/spiders/spiders/__pycache__/__init__.cpython-38.pyc b/week01/homework2/spiders/spiders/__pycache__/__init__.cpython-38.pyc
diff --git a/week01/homework2/spiders/spiders/__pycache__/items.cpython-38.pyc b/week01/homework2/spiders/spiders/__pycache__/items.cpython-38.pyc
diff --git a/week01/homework2/spiders/spiders/__pycache__/pipelines.cpython-38.pyc b/week01/homework2/spiders/spiders/__pycache__/pipelines.cpython-38.pyc
diff --git a/week01/homework2/spiders/spiders/__pycache__/settings.cpython-38.pyc b/week01/homework2/spiders/spiders/__pycache__/settings.cpython-38.pyc
diff --git a/week01/homework2/spiders/spiders/items.py b/week01/homework2/spiders/spiders/items.py
@@ -0,0 +1,16 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class SpidersItem(scrapy.Item):
+    """Container for the data collected about a single movie."""
+
+    # URL of the movie's detail page.
+    link = scrapy.Field()
+    # Movie title text.
+    movie_name = scrapy.Field()
+    # Release date text.
+    movie_date = scrapy.Field()
+    # Genre text.
+    genre = scrapy.Field()
diff --git a/week01/homework2/spiders/spiders/middlewares.py b/week01/homework2/spiders/spiders/middlewares.py
@@ -0,0 +1,103 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class SpidersSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class SpidersDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/week01/homework2/spiders/spiders/pipelines.py b/week01/homework2/spiders/spiders/pipelines.py
@@ -0,0 +1,26 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+from itemadapter import ItemAdapter
+import pandas as pd
+
+movie_infor = []
+class SpidersPipeline:
+    def process_item(self, item, spider):
+        movie_name = item['movie_name']
+        genre = item['genre']
+        movie_date = item['movie_date']
+        # output = f'|{movie_name}|\t|{genre}|\t|{movie_date}|\n\n'
+        movie_infor.append(f'电影名称:{movie_name}\n电影类型:{genre}\n上映时间:{movie_date}\n')
+        # with open('./movie2.csv','a+',encoding='utf-8') as article:
+        #     article.write(output)
+        #     article.close()
+        movie2 = pd.DataFrame(data = movie_infor)
+        # windows需要使用gbk字符集
+        movie2.to_csv('./movie2.csv', encoding='utf8', index=False, header=False)
+        return item
+
+
diff --git a/week01/homework2/spiders/spiders/settings.py b/week01/homework2/spiders/spiders/settings.py
@@ -0,0 +1,88 @@
+# Scrapy settings for spiders project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'spiders'
+
+SPIDER_MODULES = ['spiders.spiders']
+NEWSPIDER_MODULE = 'spiders.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = 'spiders (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'spiders.middlewares.SpidersSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'spiders.middlewares.SpidersDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+   'spiders.pipelines.SpidersPipeline': 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/week01/homework2/spiders/spiders/spiders/__init__.py b/week01/homework2/spiders/spiders/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/week01/homework2/spiders/spiders/spiders/__pycache__/__init__.cpython-38.pyc b/week01/homework2/spiders/spiders/spiders/__pycache__/__init__.cpython-38.pyc
diff --git a/week01/homework2/spiders/spiders/spiders/__pycache__/maoyan.cpython-38.pyc b/week01/homework2/spiders/spiders/spiders/__pycache__/maoyan.cpython-38.pyc
diff --git a/week01/homework2/spiders/spiders/spiders/maoyan.py b/week01/homework2/spiders/spiders/spiders/maoyan.py
@@ -0,0 +1,45 @@
+import scrapy
+from spiders.items import SpidersItem
+import lxml.etree
+import requests
+
+
+class MaoyanSpider(scrapy.Spider):
+    name = 'maoyan'
+    allowed_domains = ['maoyan.com']
+    start_urls = ['http://maoyan.com/films?showType=3']
+
+    # def parse(self, response):
+    #     pass
+    def start_requests(self):
+        url = self.start_urls[0]
+        yield scrapy.Request(url=url, callback=self.parse)
+
+
+    # 解析函数，获得新链接
+    def parse(self, response):
+        selector = lxml.etree.HTML(response.text.replace("<dd>","</dd><dd>"))
+        new_links = selector.xpath('//*[@class="channel-detail movie-item-title"]/a/@href')
+        links = tuple(f'https://maoyan.com' + str(i) for i in new_links)
+        for i in range(10):
+            item = SpidersItem()
+            item['link'] = links[i]
+            yield scrapy.Request(url=links[i],meta={'item':item},callback=self.parse2)
+
+
+    # 解析具体页面,获得信息
+    def parse2(self, response):
+        item = response.meta['item']
+        selector = lxml.etree.HTML(response.text.replace("<dd>","</dd><dd>"))
+        # 转化为字符串，展示结果时更干净
+        movie_name1 = selector.xpath('/html/body/div[3]/div/div[2]/div[1]/h1/text()')
+        movie_name = "".join(movie_name1)
+        movie_genre = selector.xpath('//html/body/div[3]/div/div[2]/div[1]/ul/li[1]/a/text()')
+        genre = "".join(movie_genre).strip()
+        movie_date1 = selector.xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[3]/text()')
+        movie_date = "".join(movie_date1)
+        item['movie_name'] = movie_name
+        item['genre'] = genre
+        item['movie_date'] = movie_date
+        # items.append(item)
+        yield item