Final version

2024-05-07 16:17:43 -03:00 · 2024-05-07 16:17:43 -03:00 · 63e275fa2f
commit 63e275fa2f
parent d7fd3dd578
30 changed files with 2918 additions and 5 deletions
--- a/code/playwright-quotes.py
+++ b/code/playwright-quotes.py
@ -0,0 +1,40 @@
+import scrapy
+
+
+class QuotesPlaywrightSpider(scrapy.Spider):
+    name = "quotes-playwright"
+    custom_settings = {
+        "DOWNLOAD_HANDLERS": {
+            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+        },
+        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
+        "PLAYWRIGHT_LAUNCH_OPTIONS": {
+            "headless": True,
+        },
+    }
+
+    def start_requests(self):
+        yield scrapy.Request(
+            url="http://quotes.toscrape.com/js/",
+            meta={
+                "playwright": True,
+            },
+        )
+
+    async def parse(self, response):
+        quotes = response.css(".quote")
+        for quote in quotes:
+            yield {
+                "quote": quote.css(".text::text").get(),
+                "author": quote.css(".author::text").get(),
+                "author_url": response.urljoin(quote.css("span a::attr(href)").get()),
+                "tags": quote.css(".tag *::text").getall(),
+            }
+
+        yield scrapy.Request(
+            response.urljoin(response.css(".next a::attr(href)").get()),
+            meta={
+                "playwright": True,
+            },
+        )