# Define here the models for your spider middleware
#
# See documentation in: https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class CsdnSpiderMiddleware: class CsdnDownloaderMiddleware: # Scrapy settings for CSDN project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings by consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# --- Project identity -------------------------------------------------------
BOT_NAME = "CSDN"

# Package holding the spiders; newly generated spiders are placed there too.
NEWSPIDER_MODULE = "CSDN.spiders"
SPIDER_MODULES = ["CSDN.spiders"]


# --- Request behaviour ------------------------------------------------------
# User-agent string; the value below is a desktop Chrome 91 / Windows 10
# browser identifier rather than a bot-specific name.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
)

# robots.txt rules are NOT honoured by this project.
ROBOTSTXT_OBEY = False

# Download delay (Scrapy interprets this value in seconds).
DOWNLOAD_DELAY = 3

# Headers that replace Scrapy's default request headers. The referer points
# at the fans sub-page of a CSDN blog.
DEFAULT_REQUEST_HEADERS = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "en",
    "referer": "https://dream.blog.csdn.net/?type=sub&subType=fans",
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'CSDN.pipelines.CsdnPipeline': 300,
} 总页码临时设置为 10 + for page in range(2,11): + print("正在爬取{}页".format(page),end="") + yield Request("https://blog.csdn.net/community/home-api/v1/get-fans-list?page={}&size=20&noMore=false&blogUsername=hihell".format(str(page)), callback=self.parse_item) + + def parse_item(self,response): + + data = json.loads(response.body_as_unicode()) + print("*"*100) + item = CsdnItem() + for one_item in data["data"]["list"]: + item["username"] = one_item["username"] + item["blogUrl"] = one_item["blogUrl"] + yield item \ No newline at end of file diff --git "a/\346\241\210\344\276\21331/CSDN/spiders/__init__.py" "b/\346\241\210\344\276\21331/CSDN/spiders/__init__.py" new file mode 100644 index 0000000000000000000000000000000000000000..ebd689ac51d69c5e1dbbe80083c2b20a39f8bb79 --- /dev/null +++ "b/\346\241\210\344\276\21331/CSDN/spiders/__init__.py" @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
diff --git "a/\346\241\210\344\276\21331/CSDN/spiders/__pycache__/C.cpython-38.pyc" "b/\346\241\210\344\276\21331/CSDN/spiders/__pycache__/C.cpython-38.pyc" new file mode 100644 index 0000000000000000000000000000000000000000..4ba2a0eca2eb750d46d4078e466c0d40dfcb9c92 Binary files /dev/null and "b/\346\241\210\344\276\21331/CSDN/spiders/__pycache__/C.cpython-38.pyc" differ diff --git "a/\346\241\210\344\276\21331/CSDN/spiders/__pycache__/__init__.cpython-38.pyc" "b/\346\241\210\344\276\21331/CSDN/spiders/__pycache__/__init__.cpython-38.pyc" new file mode 100644 index 0000000000000000000000000000000000000000..f05595e0f36c9600c1fb2ecaed8319b26ebf7160 Binary files /dev/null and "b/\346\241\210\344\276\21331/CSDN/spiders/__pycache__/__init__.cpython-38.pyc" differ diff --git "a/\346\241\210\344\276\21331/scrapy.cfg" "b/\346\241\210\344\276\21331/scrapy.cfg" new file mode 100644 index 0000000000000000000000000000000000000000..06892ea5a3183d583572e90de08aef2b740ab45f --- /dev/null +++ "b/\346\241\210\344\276\21331/scrapy.cfg" @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = CSDN.settings + +[deploy] +#url = http://localhost:6800/ +project = CSDN