Final version

This commit is contained in:
Renne Rocha 2024-05-07 16:17:43 -03:00
parent d7fd3dd578
commit 63e275fa2f
30 changed files with 2918 additions and 5 deletions

40
code/playwright-quotes.py Normal file
View file

@ -0,0 +1,40 @@
import scrapy
class QuotesPlaywrightSpider(scrapy.Spider):
    """Scrape quotes from the JavaScript-rendered listing at quotes.toscrape.com/js/.

    Every request carries ``meta={"playwright": True}`` so that it is handled
    by the scrapy-playwright download handler configured in
    ``custom_settings``.
    """

    name = "quotes-playwright"

    custom_settings = {
        # Send both http and https traffic through the scrapy-playwright handler.
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        # asyncio-based reactor, as called for by scrapy-playwright's setup docs.
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "PLAYWRIGHT_LAUNCH_OPTIONS": {
            "headless": True,
        },
    }

    def start_requests(self):
        """Yield the initial Playwright-enabled request for the first page."""
        yield scrapy.Request(
            url="http://quotes.toscrape.com/js/",
            meta={
                "playwright": True,
            },
        )

    async def parse(self, response):
        """Yield one item per quote on the page, then follow the next-page link.

        Each item is a dict with the keys ``quote``, ``author``,
        ``author_url`` and ``tags``. ``author_url`` is ``None`` when the
        quote has no author link. A follow-up request is yielded only when
        the page contains a ``.next a`` link.
        """
        for quote in response.css(".quote"):
            author_href = quote.css("span a::attr(href)").get()
            yield {
                "quote": quote.css(".text::text").get(),
                "author": quote.css(".author::text").get(),
                # urljoin(None) would resolve to the page URL itself, so a
                # missing link is reported as None instead of a bogus URL.
                "author_url": (
                    response.urljoin(author_href)
                    if author_href is not None
                    else None
                ),
                "tags": quote.css(".tag *::text").getall(),
            }

        next_href = response.css(".next a::attr(href)").get()
        # The last page has no "next" link. Without this guard the spider
        # would re-request the current page and rely on the duplicate filter
        # to stop the crawl.
        if next_href is not None:
            yield scrapy.Request(
                response.urljoin(next_href),
                meta={
                    "playwright": True,
                },
            )