Commit 96569ca0 authored by 梦想橡皮擦

Scrape the 建筑档案 (architecture archive) website

Parent 27c82d81
# Runner script: launch the "jianzhu" spider programmatically
from scrapy import cmdline

cmdline.execute("scrapy crawl jianzhu".split())
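An equivalent way to launch the crawl from Python is a CrawlerProcess; this is a minimal sketch, assuming it runs from the project root so mySpider.settings resolves:

# Alternative runner sketch: drive the crawl via CrawlerProcess.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("jianzhu")  # spider name defined in JianshekuSpider.name
process.start()           # blocking call; returns when the crawl finishes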
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class MyspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Article title
    title = scrapy.Field()
    # Publisher name
    userName = scrapy.Field()
    # Publish time
    createTime = scrapy.Field()
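Scrapy Items behave like dicts with a fixed key set; a minimal usage sketch (values are illustrative) of how an item is filled and why typos in field names fail fast:

# Usage sketch: dict-style access is allowed only for declared fields.
item = MyspiderItem()
item["title"] = "example title"       # OK: declared field
item["userName"] = "example author"   # OK: declared field
# item["titel"] = "typo"              # would raise KeyError: unsupported field
print(dict(item))                     # only the fields that were set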
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class MyspiderSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
class MyspiderDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
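To make the hooks above concrete, here is a minimal hypothetical downloader middleware showing the return-value contract; it is not part of this project, and the class name and header are made up:

# Hypothetical example middleware illustrating process_request/process_response.
class TagRequestsExampleMiddleware:
    def process_request(self, request, spider):
        # Stamp every outgoing request with an extra header (illustrative name).
        request.headers.setdefault("X-Crawler", spider.name)
        return None  # None lets the request continue through remaining middlewares

    def process_response(self, request, response, spider):
        # Log non-200 responses, then pass the response along unchanged.
        if response.status != 200:
            spider.logger.warning("Got %s for %s", response.status, request.url)
        return response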
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import os
import csv
class MyspiderPipeline:
    def __init__(self):
        # CSV output file, opened in append mode next to the spiders package
        store_file = os.path.dirname(__file__) + "/spiders/school1.csv"
        self.file = open(store_file, "a+", newline='', encoding="utf-8")
        self.writer = csv.writer(self.file)

    def process_item(self, item, spider):
        try:
            self.writer.writerow((
                item["title"],
                item["userName"],
                item["createTime"]
            ))
        except Exception as e:
            print(e.args)
        # Return the item so any later pipelines can still process it
        return item

    def close_spider(self, spider):
        self.file.close()
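As a side note, the same CSV could be produced without a custom pipeline via Scrapy's built-in feed exports; this is a minimal sketch assuming Scrapy >= 2.1, with an illustrative output path:

# In settings.py (sketch): built-in feed export writing the same three columns.
FEEDS = {
    "spiders/school1.csv": {
        "format": "csv",
        "encoding": "utf-8",
        "fields": ["title", "userName", "createTime"],  # column order
    },
}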
# Scrapy settings for mySpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'mySpider'
SPIDER_MODULES = ['mySpider.spiders']
NEWSPIDER_MODULE = 'mySpider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mySpider (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'mySpider.middlewares.MyspiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'mySpider.middlewares.MyspiderDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'mySpider.pipelines.MyspiderPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
from scrapy import FormRequest
import json
from mySpider.items import MyspiderItem


class JianshekuSpider(scrapy.Spider):
    name = 'jianzhu'
    allowed_domains = ['admin.jzda001.com']
    start_url = 'https://admin.jzda001.com/api/core/002--newsList'

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Referer": "https://www.baidu.com/xyzg/"
        }

    # The API is paginated via POST form data, so start_requests() is overridden
    def start_requests(self):
        # Fetch only the first 4 pages of data (range(1, 5))
        for page in range(1, 5):
            form_data = {
                "type": "1",
                "limit": "17",
                "pageNo": str(page)
            }
            request = FormRequest(
                self.start_url, headers=self.headers, formdata=form_data, callback=self.parse)
            yield request
    def parse(self, response):
        # response.body_as_unicode() is deprecated; response.text is the unicode body
        data = json.loads(response.text)
        data = data["rows"]  # the article records live under the "rows" key
        print(data)
        for item in data:
            school = MyspiderItem()
            school["title"] = item["title"]
            school["userName"] = item["userName"]
            school["createTime"] = item["createTime"]
            # Hand the item to the pipelines configured in settings.py
            yield school
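For orientation, a hypothetical minimal payload showing the response shape parse() assumes; only the keys the spider actually reads are included, and the sample values are taken from the scraped CSV below:

# Hypothetical minimal API payload; the real response may carry more fields.
sample = {
    "rows": [
        {
            "title": "行村设计 | 藏如明信片般的四季风景 202村温泉民宿",
            "userName": "建筑档案",
            "createTime": "2021-08-20 18:24:23",
        }
    ]
}
rows = sample["rows"]  # mirrors the data["rows"] access in parse()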
弘阳服务:上半年归属股东净利润5983万元 同比增156%,中国网地产,2021-08-19 22:53:48
灵感库丨城市中消失的“围墙”,让空间连接生活,Roca Gallery乐家艺术廊,2021-08-19 18:38:13
【建言︱对话大桥谕】:延续扎哈设计思想,探索未来城市,Roca Gallery乐家艺术廊,2021-08-19 18:38:08
半年报快读|危中有机 “零踩线”越秀地产稳中求进,中国网地产,2021-08-19 15:14:06
John Gillen荣升Aedas执行董事,Aedas,2021-08-19 14:30:55
喜讯丨四项由Aedas设计的项目荣膺2021年国际建筑大奖,Aedas,2021-08-19 14:30:43
Aedas设计的香港西贡WM酒店携手2021香港小姐美丽同行!,Aedas,2021-08-19 14:30:30
李忠观点丨从东亚文化消费发展,看中国国潮崛起(三),华高莱斯,2021-08-19 11:43:23
喜讯丨承构建筑荣获两项法国Novum Design Award建筑设计类别最高奖项,承构建筑,2021-08-19 09:28:40
新时代的科学城:你应该了解的6大命题!,TOP创新区研究院,2021-08-19 09:27:34
此心安处 | 海口罗牛山玖悦台 · 金樾府 铂樾府,奥雅设计LA-2013,2021-08-19 09:27:05
趋势|芒果雪糕色,戴昆学习小组,2021-08-19 09:24:38
优客工场企业服务生态集群之 | 省广众烁:打造低成本高品质的数字整合营销服务,优客工场ucommune,2021-08-19 09:24:07
半年报解读丨时代中国:上半年收入降至136.38亿元 非控股股东权益利润分配成谜,中国网地产,2021-08-19 00:09:43
卓越商企服务:预计上半年归属股东净利润同比增55%,中国网地产,2021-08-19 00:08:57
“玺悦相逢 致敬望京” 2021望京国际化发展主题峰会周六启幕,中国网地产,2021-08-20 13:16:00
喜讯 | GLC两项作品入围英国SBID国际设计大奖Finalist,投票启动!,GLC(中国),2021-08-21 20:14:48
建业地产:2021年上半年归属股东净利润7.29亿元 同比上升0.3%,中国网地产,2021-08-19 00:08:03
宝龙地产:上半年归属股东净利润39.08亿元 同比上升约76.1%,中国网地产,2021-08-19 00:07:00
筑土分享丨联合国第六次气候变化报告发布:地球向人类发出的“红色警报”,筑土国际,2021-08-20 11:27:49
UNStudio「索契海滨包容性规划设计」被评选为获胜方案,UNStudio,2021-08-20 10:58:13
澳大利亚墨尔本南岸项目中标三周年!,UNStudio,2021-08-20 10:57:04
"佳期如约, 荣光而至 | 2021年中项目荣誉合集",致逸设计,2021-08-20 09:24:56
做产业创新区的五大雷区,TOP创新区研究院,2021-08-20 09:17:56
金科服务:上半年归属股东净利润5.29亿元 同比增80.3%,中国网地产,2021-08-19 23:03:43
华发物业服务:预计上半年归属股东净利润同比增长超400%,中国网地产,2021-08-19 23:03:03
中国宏泰发展:上半年归属股东净利润1.03亿元 同比减少82.23%,中国网地产,2021-08-19 23:02:09
南国置业:上半年归属股东净亏损4.23亿元,中国网地产,2021-08-19 23:00:21
远洋集团:上半年归属股东净利润10.10亿元,中国网地产,2021-08-19 22:59:12
半年报快读|半年收租8亿元 “包租公”SOHO中国的生意场,中国网地产,2021-08-19 22:58:29
华发股份:上半年归属股东净利润16.87亿元 同比增15.03%,中国网地产,2021-08-19 22:57:46
南京高科:上半年归属股东净利润14.32亿元 同比增长0.68%,中国网地产,2021-08-19 22:56:36
荣盛发展:上半年归属股东净利润25.32亿元,中国网地产,2021-08-19 22:55:46
复星旅游文化:上半年归属股东净亏损20.04亿元,中国网地产,2021-08-19 22:55:09
商办市场高质量找房网站,速读网六大优势了解一下,优客工场ucommune,2021-08-23 09:23:36
行村设计 | 藏如明信片般的四季风景 202村温泉民宿,建筑档案,2021-08-20 18:24:23
谈加薪,先要一杯水;谈离职,要杯星巴克。,那小海,2021-08-20 18:02:47
新书出版 |《SOM 作品精选》系列丛书最新版:收录过去十年最脍炙人口的设计作品,SOM设计事务所,2021-08-20 17:17:31
UNStudio赢得韩国忠南美术馆设计竞赛,UNStudio,2021-08-20 10:58:26
UNStudio 亚洲工作室 | 建成项目,UNStudio,2021-08-18 14:31:26
重磅嘉宾阵容公布,五大前沿主题蓄势待发 | 2021上海国际建筑文化周,建筑档案,2021-08-18 10:09:07
独家 | 成都仁恒置地广场将如何改造?,伍兹贝格建筑设计事务所,2021-08-17 09:37:27
佳兆业美好:上半年归属股东净利润2.09亿元 同比增75.6%,中国网地产,2021-08-19 22:54:29
政策篇 | 严禁高杠杆企业结构化发债,多地新房销售政策趋严, 建诚晟业,2021-08-23 09:20:11
GLC | 作品合集,GLC(中国),2021-08-21 20:15:26
北京住建委:严禁样板间“货不对板”;华润置地前7月合同销售金额约1878.1亿元丨地产财经早餐,中国网地产,2021-08-21 06:45:03
北京前7月商品房销售面积603.6万平方米;中海物业上半年归属股东净利润3.93亿港元丨地产财经早餐,中国网地产,2021-08-21 06:44:55
北京加强公租房资格复核及分配管理;雅居乐上半年归属股东净利润52.9亿元丨地产财经早餐,中国网地产,2021-08-21 06:44:37
河北严禁无证认购、认筹等变相售房行为;荣盛发展上半年归属股东净利润25.32亿元丨地产财经早餐,中国网地产,2021-08-21 06:43:20
杭州推出“个人自主挂牌房源”线上新渠道,中国网地产,2021-08-21 06:42:50
奥园健康:上半年归属股东净利润1.77亿元 同比增60.16%,中国网地产,2021-08-21 06:06:23
亿达中国:上半年归属股东净利润2.71亿元 同比降8.8%,中国网地产,2021-08-21 06:05:13
三盛控股:上半年归属股东净利润6.25亿元 同比增加293.1%,中国网地产,2021-08-21 06:04:26
中国奥园:上半年归属股东净利润20.88亿元,中国网地产,2021-08-21 06:03:16
喜讯 | 阿拓拉斯荣获ELA国际景观大奖,阿拓拉斯(中国)规划·设计,2021-08-16 15:34:22
在研究 | 从地图看一个华北农村二十多年的变化,在建筑,2021-08-16 10:00:03
蓝天组Wolf D. Prix:从解构主义到人工智能的“两日之遥”,构筑空间,2021-07-30 10:41:51
社会住宅不是低品质社区的代名词,低收入者也有享受高质量建筑空间的权利,URBANUS都市实践,2021-07-30 09:22:52
LA聚焦 | 李雄:公园体检——助力城市公园系统更新,风景园林杂志,2021-07-07 11:12:07
李忠观点丨紧抓数字游牧民,兑现海岛科技价值,华高莱斯,2021-06-23 17:29:32
朗诗绿色生活:上半年净利润1444万元 同比减少1.4%,中国网地产,2021-08-21 06:02:09
上海市财政局:本市契税的适用税率为3%,中国网地产,2021-08-21 05:15:46
股份过户已获批复 华建控股成为嘉凯城控股股东,中国网地产,2021-08-21 05:14:19
业绩五年增长十倍 打造绿地企业新名片,中国网地产,2021-08-20 20:27:02
Archdaily专访BIG出版三部曲,BIG建筑事务所,2021-08-20 17:55:15
趋势预警|材料美学(下),戴昆学习小组,2021-06-07 09:25:24
上海新田360广场:浦东腹地的“品质生活社交场”,三益中国,2021-08-23 09:55:44
趋势|造梦空间,戴昆学习小组,2021-08-23 09:24:06
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = mySpider.settings
[deploy]
#url = http://localhost:6800/
project = mySpider