Commit 96569ca0 authored by 梦想橡皮擦

Scrape the 建筑档案 (architecture archive) website

Parent 27c82d81
# Runner script: launch the "jianzhu" spider programmatically
from scrapy import cmdline

cmdline.execute("scrapy crawl jianzhu".split())
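An equivalent way to launch the crawl from Python is a CrawlerProcess; this is a minimal sketch, assuming it runs from the project root so mySpider.settings resolves:

# Alternative runner sketch: drive the crawl via CrawlerProcess.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("jianzhu")  # spider name defined in JianshekuSpider.name
process.start()           # blocking call; returns when the crawl finishes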
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class MyspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Article title
    title = scrapy.Field()
    # Publisher name
    userName = scrapy.Field()
    # Publish time
    createTime = scrapy.Field()
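Scrapy Items behave like dicts with a fixed key set; a minimal usage sketch (values are illustrative) of how an item is filled and why typos in field names fail fast:

# Usage sketch: dict-style access is allowed only for declared fields.
item = MyspiderItem()
item["title"] = "example title"       # OK: declared field
item["userName"] = "example author"   # OK: declared field
# item["titel"] = "typo"              # would raise KeyError: unsupported field
print(dict(item))                     # only the fields that were set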
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class MyspiderSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
class MyspiderDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
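To make the hooks above concrete, here is a minimal hypothetical downloader middleware showing the return-value contract; it is not part of this project, and the class name and header are made up:

# Hypothetical example middleware illustrating process_request/process_response.
class TagRequestsExampleMiddleware:
    def process_request(self, request, spider):
        # Stamp every outgoing request with an extra header (illustrative name).
        request.headers.setdefault("X-Crawler", spider.name)
        return None  # None lets the request continue through remaining middlewares

    def process_response(self, request, response, spider):
        # Log non-200 responses, then pass the response along unchanged.
        if response.status != 200:
            spider.logger.warning("Got %s for %s", response.status, request.url)
        return response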
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import os
import csv
class MyspiderPipeline:
    def __init__(self):
        # CSV output file, opened in append mode next to the spiders package
        store_file = os.path.dirname(__file__) + "/spiders/school1.csv"
        self.file = open(store_file, "a+", newline='', encoding="utf-8")
        self.writer = csv.writer(self.file)

    def process_item(self, item, spider):
        try:
            self.writer.writerow((
                item["title"],
                item["userName"],
                item["createTime"]
            ))
        except Exception as e:
            print(e.args)
        # Return the item so any later pipelines can still process it
        return item

    def close_spider(self, spider):
        self.file.close()
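As a side note, the same CSV could be produced without a custom pipeline via Scrapy's built-in feed exports; this is a minimal sketch assuming Scrapy >= 2.1, with an illustrative output path:

# In settings.py (sketch): built-in feed export writing the same three columns.
FEEDS = {
    "spiders/school1.csv": {
        "format": "csv",
        "encoding": "utf-8",
        "fields": ["title", "userName", "createTime"],  # column order
    },
}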
# Scrapy settings for mySpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'mySpider'
SPIDER_MODULES = ['mySpider.spiders']
NEWSPIDER_MODULE = 'mySpider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mySpider (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'mySpider.middlewares.MyspiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'mySpider.middlewares.MyspiderDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'mySpider.pipelines.MyspiderPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
from scrapy import FormRequest
import json
from mySpider.items import MyspiderItem


class JianshekuSpider(scrapy.Spider):
    name = 'jianzhu'
    allowed_domains = ['admin.jzda001.com']
    start_url = 'https://admin.jzda001.com/api/core/002--newsList'

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Referer": "https://www.baidu.com/xyzg/"
        }

    # The API is paginated via POST form data, so start_requests() is overridden
    def start_requests(self):
        # Fetch only the first 4 pages of data (range(1, 5))
        for page in range(1, 5):
            form_data = {
                "type": "1",
                "limit": "17",
                "pageNo": str(page)
            }
            request = FormRequest(
                self.start_url, headers=self.headers, formdata=form_data, callback=self.parse)
            yield request
    def parse(self, response):
        # response.body_as_unicode() is deprecated; response.text is the unicode body
        data = json.loads(response.text)
        data = data["rows"]  # the article records live under the "rows" key
        print(data)
        for item in data:
            school = MyspiderItem()
            school["title"] = item["title"]
            school["userName"] = item["userName"]
            school["createTime"] = item["createTime"]
            # Hand the item to the pipelines configured in settings.py
            yield school
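For orientation, a hypothetical minimal payload showing the response shape parse() assumes; only the keys the spider actually reads are included, and the sample values are taken from the scraped CSV below:

# Hypothetical minimal API payload; the real response may carry more fields.
sample = {
    "rows": [
        {
            "title": "行村设计 | 藏如明信片般的四季风景 202村温泉民宿",
            "userName": "建筑档案",
            "createTime": "2021-08-20 18:24:23",
        }
    ]
}
rows = sample["rows"]  # mirrors the data["rows"] access in parse()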
弘阳服务:上半年归属股东净利润5983万元 同比增156%,中国网地产,2021-08-19 22:53:48
灵感库丨城市中消失的“围墙”,让空间连接生活,Roca Gallery乐家艺术廊,2021-08-19 18:38:13
【建言︱对话大桥谕】:延续扎哈设计思想,探索未来城市,Roca Gallery乐家艺术廊,2021-08-19 18:38:08
半年报快读|危中有机 “零踩线”越秀地产稳中求进,中国网地产,2021-08-19 15:14:06
John Gillen荣升Aedas执行董事,Aedas,2021-08-19 14:30:55
喜讯丨四项由Aedas设计的项目荣膺2021年国际建筑大奖,Aedas,2021-08-19 14:30:43
Aedas设计的香港西贡WM酒店携手2021香港小姐美丽同行!,Aedas,2021-08-19 14:30:30
李忠观点丨从东亚文化消费发展,看中国国潮崛起(三),华高莱斯,2021-08-19 11:43:23
喜讯丨承构建筑荣获两项法国Novum Design Award建筑设计类别最高奖项,承构建筑,2021-08-19 09:28:40
新时代的科学城:你应该了解的6大命题!,TOP创新区研究院,2021-08-19 09:27:34
此心安处 | 海口罗牛山玖悦台 · 金樾府 铂樾府,奥雅设计LA-2013,2021-08-19 09:27:05
趋势|芒果雪糕色,戴昆学习小组,2021-08-19 09:24:38
优客工场企业服务生态集群之 | 省广众烁:打造低成本高品质的数字整合营销服务,优客工场ucommune,2021-08-19 09:24:07
半年报解读丨时代中国:上半年收入降至136.38亿元 非控股股东权益利润分配成谜,中国网地产,2021-08-19 00:09:43
卓越商企服务:预计上半年归属股东净利润同比增55%,中国网地产,2021-08-19 00:08:57
“玺悦相逢 致敬望京” 2021望京国际化发展主题峰会周六启幕,中国网地产,2021-08-20 13:16:00
喜讯 | GLC两项作品入围英国SBID国际设计大奖Finalist,投票启动!,GLC(中国),2021-08-21 20:14:48
建业地产:2021年上半年归属股东净利润7.29亿元 同比上升0.3%,中国网地产,2021-08-19 00:08:03
宝龙地产:上半年归属股东净利润39.08亿元 同比上升约76.1%,中国网地产,2021-08-19 00:07:00
筑土分享丨联合国第六次气候变化报告发布:地球向人类发出的“红色警报”,筑土国际,2021-08-20 11:27:49
UNStudio「索契海滨包容性规划设计」被评选为获胜方案,UNStudio,2021-08-20 10:58:13
澳大利亚墨尔本南岸项目中标三周年!,UNStudio,2021-08-20 10:57:04
"佳期如约, 荣光而至 | 2021年中项目荣誉合集",致逸设计,2021-08-20 09:24:56
做产业创新区的五大雷区,TOP创新区研究院,2021-08-20 09:17:56
金科服务:上半年归属股东净利润5.29亿元 同比增80.3%,中国网地产,2021-08-19 23:03:43
华发物业服务:预计上半年归属股东净利润同比增长超400%,中国网地产,2021-08-19 23:03:03
中国宏泰发展:上半年归属股东净利润1.03亿元 同比减少82.23%,中国网地产,2021-08-19 23:02:09
南国置业:上半年归属股东净亏损4.23亿元,中国网地产,2021-08-19 23:00:21
远洋集团:上半年归属股东净利润10.10亿元,中国网地产,2021-08-19 22:59:12
半年报快读|半年收租8亿元 “包租公”SOHO中国的生意场,中国网地产,2021-08-19 22:58:29
华发股份:上半年归属股东净利润16.87亿元 同比增15.03%,中国网地产,2021-08-19 22:57:46
南京高科:上半年归属股东净利润14.32亿元 同比增长0.68%,中国网地产,2021-08-19 22:56:36
荣盛发展:上半年归属股东净利润25.32亿元,中国网地产,2021-08-19 22:55:46
复星旅游文化:上半年归属股东净亏损20.04亿元,中国网地产,2021-08-19 22:55:09
商办市场高质量找房网站,速读网六大优势了解一下,优客工场ucommune,2021-08-23 09:23:36
行村设计 | 藏如明信片般的四季风景 202村温泉民宿,建筑档案,2021-08-20 18:24:23
谈加薪,先要一杯水;谈离职,要杯星巴克。,那小海,2021-08-20 18:02:47
新书出版 |《SOM 作品精选》系列丛书最新版:收录过去十年最脍炙人口的设计作品,SOM设计事务所,2021-08-20 17:17:31
UNStudio赢得韩国忠南美术馆设计竞赛,UNStudio,2021-08-20 10:58:26
UNStudio 亚洲工作室 | 建成项目,UNStudio,2021-08-18 14:31:26
重磅嘉宾阵容公布,五大前沿主题蓄势待发 | 2021上海国际建筑文化周,建筑档案,2021-08-18 10:09:07
独家 | 成都仁恒置地广场将如何改造?,伍兹贝格建筑设计事务所,2021-08-17 09:37:27
佳兆业美好:上半年归属股东净利润2.09亿元 同比增75.6%,中国网地产,2021-08-19 22:54:29
政策篇 | 严禁高杠杆企业结构化发债,多地新房销售政策趋严, 建诚晟业,2021-08-23 09:20:11
GLC | 作品合集,GLC(中国),2021-08-21 20:15:26
北京住建委:严禁样板间“货不对板”;华润置地前7月合同销售金额约1878.1亿元丨地产财经早餐,中国网地产,2021-08-21 06:45:03
北京前7月商品房销售面积603.6万平方米;中海物业上半年归属股东净利润3.93亿港元丨地产财经早餐,中国网地产,2021-08-21 06:44:55
北京加强公租房资格复核及分配管理;雅居乐上半年归属股东净利润52.9亿元丨地产财经早餐,中国网地产,2021-08-21 06:44:37
河北严禁无证认购、认筹等变相售房行为;荣盛发展上半年归属股东净利润25.32亿元丨地产财经早餐,中国网地产,2021-08-21 06:43:20
杭州推出“个人自主挂牌房源”线上新渠道,中国网地产,2021-08-21 06:42:50
奥园健康:上半年归属股东净利润1.77亿元 同比增60.16%,中国网地产,2021-08-21 06:06:23
亿达中国:上半年归属股东净利润2.71亿元 同比降8.8%,中国网地产,2021-08-21 06:05:13
三盛控股:上半年归属股东净利润6.25亿元 同比增加293.1%,中国网地产,2021-08-21 06:04:26
中国奥园:上半年归属股东净利润20.88亿元,中国网地产,2021-08-21 06:03:16
喜讯 | 阿拓拉斯荣获ELA国际景观大奖,阿拓拉斯(中国)规划·设计,2021-08-16 15:34:22
在研究 | 从地图看一个华北农村二十多年的变化,在建筑,2021-08-16 10:00:03
蓝天组Wolf D. Prix:从解构主义到人工智能的“两日之遥”,构筑空间,2021-07-30 10:41:51
社会住宅不是低品质社区的代名词,低收入者也有享受高质量建筑空间的权利,URBANUS都市实践,2021-07-30 09:22:52
LA聚焦 | 李雄:公园体检——助力城市公园系统更新,风景园林杂志,2021-07-07 11:12:07
李忠观点丨紧抓数字游牧民,兑现海岛科技价值,华高莱斯,2021-06-23 17:29:32
朗诗绿色生活:上半年净利润1444万元 同比减少1.4%,中国网地产,2021-08-21 06:02:09
上海市财政局:本市契税的适用税率为3%,中国网地产,2021-08-21 05:15:46
股份过户已获批复 华建控股成为嘉凯城控股股东,中国网地产,2021-08-21 05:14:19
业绩五年增长十倍 打造绿地企业新名片,中国网地产,2021-08-20 20:27:02
Archdaily专访BIG出版三部曲,BIG建筑事务所,2021-08-20 17:55:15
趋势预警|材料美学(下),戴昆学习小组,2021-06-07 09:25:24
上海新田360广场:浦东腹地的“品质生活社交场”,三益中国,2021-08-23 09:55:44
趋势|造梦空间,戴昆学习小组,2021-08-23 09:24:06
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = mySpider.settings
[deploy]
#url = http://localhost:6800/
project = mySpider