🤖 Dependency Module Implementation
Spider Module
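The spider is a standard Scrapy spider: it starts from the configured start_urls, extracts and deduplicates the links on each response, follows the new links, and yields the URL batches as items for the pipeline to store. It imports URLItem from myproject.items; since the code only ever sets item["urls"], a minimal sketch of that item (an assumed definition, not shown in the original) is a single field:

import scrapy

class URLItem(scrapy.Item):
    # Assumed definition: the spider only populates a "urls" field.
    urls = scrapy.Field()

The spider implementation: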
import scrapy
from myproject.items import URLItem
from store import storage
from Extractor import URLExtractor
from Recorder.Recorder import Recorder
import logging
import re
# from fake_useragent import UserAgent
import myproject.settings as mysetting

# Template for the log records handed to the Recorder
logtemp = {
    "name": "scrapy",
    "type": "log",
    "level": "info",
    "messages": []
}

class TestSpider(scrapy.Spider):
    name = "test"
    allowed_domains = []
    start_urls = []
    cookies = {}

    def __init__(self, project_name="", cookies=None, allowed_domains=None, start_urls=None, *args, **kwargs):
        super(TestSpider, self).__init__(*args, **kwargs)
        logging.info("Initializing spider")
        self.project_name = project_name
        self.cookies = cookies
        self.allowed_domains = allowed_domains
        self.start_urls = start_urls
        # Create the recorder
        self.recorder = Recorder(project_name)
        # Create the storage backend
        self.storage = storage.Storage_Base()
        # Create the URL extractor
        self.url_extractor = URLExtractor.URLProcessor(allow_domains=self.allowed_domains, start_urls=self.start_urls)

    # Runs once when the spider starts; the initial requests are issued here
    def start_requests(self):
        print('Spider running...')
        log = dict(logtemp)  # copy the template so the module-level dict is not mutated
        log["messages"] = [f'Spider started:\n\t{self.cookies}\n\t{self.start_urls}\n\t{self.allowed_domains}\n']
        print(log["messages"])
        self.recorder.log(log)
        # Request the initial URLs
        for start_url in self.start_urls:
            log["messages"] = [f'Requesting initial URL: {start_url}']
            self.recorder.log(log)
            print(log["messages"])
            yield scrapy.Request(url=start_url, cookies=self.cookies, callback=self.parse)

    def parse(self, response):
        # Extract and deduplicate URLs
        try:
            # print("extracting urls")
            urls = self.url_extractor.get_urls(response)  # normalized URLs
            De_duplication_urls = self.url_extractor.De_duplication_url(urls)  # deduplicated URLs
            if De_duplication_urls:
                for url in De_duplication_urls:
                    print(url)
                    # Follow each new URL with the same parse callback
                    yield response.follow(url, cookies=self.cookies, callback=self.parse)
                item = URLItem()
                item["urls"] = De_duplication_urls
                yield item  # hand off to the pipeline, which stores the URL data
        except Exception as e:
            logging.error('Extract_url: %s', e)
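A minimal sketch of launching this spider programmatically; CrawlerProcess and get_project_settings are standard Scrapy APIs, and the keyword arguments mirror TestSpider.__init__, but the concrete project name, cookies, and URLs are placeholders:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(
    TestSpider,
    project_name="demo",                  # placeholder
    cookies={"sessionid": "xxx"},         # placeholder
    allowed_domains=["example.com"],      # placeholder
    start_urls=["https://example.com/"],  # placeholder
)
process.start()  # blocks until the crawl finishes

Scrapy forwards the keyword arguments of crawl() to the spider's constructor, which is how project_name, cookies, allowed_domains, and start_urls reach the spider above.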
Extractor Module
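From the calls in the spider, URLExtractor.URLProcessor needs two methods: get_urls(response), which pulls links out of a response and normalizes them, and De_duplication_url(urls), which drops URLs that have already been seen. The sketch below is an assumed minimal implementation of that interface, not the project's actual code; the CSS selector and the in-memory seen set are illustrative choices:

from urllib.parse import urljoin, urldefrag

class URLProcessor:
    def __init__(self, allow_domains=None, start_urls=None):
        self.allow_domains = allow_domains or []
        self.start_urls = start_urls or []
        self.seen = set()  # assumed in-memory dedup store

    def get_urls(self, response):
        # Collect href attributes, resolve them against the page URL,
        # strip fragments, and keep only URLs in the allowed domains.
        urls = []
        for href in response.css("a::attr(href)").getall():
            url, _ = urldefrag(urljoin(response.url, href))
            if any(domain in url for domain in self.allow_domains):
                urls.append(url)
        return urls

    def De_duplication_url(self, urls):
        # Return only URLs not seen before and remember them.
        fresh = [u for u in urls if u not in self.seen]
        self.seen.update(fresh)
        return fresh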
Storage Module
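The spider only instantiates storage.Storage_Base and never calls it in the code shown above, so its real interface is unknown. A minimal sketch, assuming it appends URL records to a JSON-lines file; the file path and the save method name are illustrative assumptions:

import json

class Storage_Base:
    """Assumed minimal storage backend: appends records as JSON lines."""

    def __init__(self, path="urls.jsonl"):  # illustrative default path
        self.path = path

    def save(self, record):
        # One record per line keeps the file append-only and easy to stream.
        with open(self.path, "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")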