Final version

Renne Rocha 2024-05-07 16:17:43 -03:00
parent d7fd3dd578
commit 63e275fa2f
30 changed files with 2918 additions and 5 deletions

code/exercise-1.py (new file, 21 lines added)

@@ -0,0 +1,21 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ["https://quotes.toscrape.com"]

    def parse(self, response):
        quotes = response.css(".quote")
        for quote in quotes:
            yield {
                "quote": quote.css(".text::text").get(),
                "author": quote.css(".author::text").get(),
                "author_url": response.urljoin(quote.css("span a::attr(href)").get()),
                "tags": quote.css(".tag *::text").getall(),
            }

        # Follow pagination only while a "Next" link exists; on the last
        # page .get() returns None and urljoin(None) would raise an error.
        next_href = response.css(".next a::attr(href)").get()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href))
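
To try the spider without creating a full Scrapy project, scrapy runspider code/exercise-1.py -O quotes.json does the job. Running it programmatically also works; a minimal sketch (not part of the commit), assuming the file is importable under a module name such as exercise_1:

from scrapy.crawler import CrawlerProcess

from exercise_1 import QuotesSpider  # hypothetical module name

process = CrawlerProcess(settings={"FEEDS": {"quotes.json": {"format": "json"}}})
process.crawl(QuotesSpider)
process.start()  # blocks until the crawl finishes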

code/exercise-2.py (new file, 28 lines added)

@@ -0,0 +1,28 @@
import scrapy


class QuotesScrollSpider(scrapy.Spider):
    name = "quotes_scroll"
    allowed_domains = ["quotes.toscrape.com"]
    api_url = "https://quotes.toscrape.com/api/quotes?page={page}"

    def start_requests(self):
        yield scrapy.Request(self.api_url.format(page=1))

    def parse(self, response):
        data = response.json()
        current_page = data.get("page")
        for quote in data.get("quotes"):
            yield {
                "quote": quote.get("text"),
                "author": quote.get("author").get("name"),
                "author_url": response.urljoin(
                    quote.get("author").get("goodreads_link")
                ),
                "tags": quote.get("tags"),
            }

        if data.get("has_next"):
            next_page = current_page + 1
            yield scrapy.Request(
                self.api_url.format(page=next_page),
            )
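
This spider never touches the site's HTML; it pages through the JSON API directly. For reference, the payload shape it relies on, reconstructed from the keys the code reads (values are illustrative placeholders, not real API output):

# One page of the quotes API, as assumed by the spider above.
sample_page = {
    "page": 1,
    "has_next": True,
    "quotes": [
        {
            "text": "An example quote.",
            "tags": ["example"],
            "author": {
                "name": "An Author",
                "goodreads_link": "/author/show/0000000",
            },
        }
    ],
}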

code/exercise-3.py (new file, 24 lines added)

@@ -0,0 +1,24 @@
import json

import scrapy


class QuotesJSSpider(scrapy.Spider):
    name = "quotes_js"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ["https://quotes.toscrape.com/js/"]

    def parse(self, response):
        # The quotes are embedded in an inline <script> tag as a JavaScript
        # array literal: extract it with a regex and parse it as JSON.
        raw_quotes = response.xpath("//script").re_first(r"var data = ((?s:\[.*?\]));")
        quotes = json.loads(raw_quotes)
        for quote in quotes:
            yield {
                "quote": quote.get("text"),
                "author": quote.get("author").get("name"),
                "author_url": response.urljoin(
                    quote.get("author").get("goodreads_link")
                ),
                "tags": quote.get("tags"),
            }

        # Follow pagination only while a "Next" link exists.
        next_href = response.css(".next a::attr(href)").get()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href))
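
The crux here is the re_first call: the quotes on the /js/ page live in an inline <script> tag as a JavaScript array literal, which happens to be valid JSON. A standalone sketch of the same extraction, against an invented sample script:

import json
import re

# Hypothetical, much-simplified version of the inline script on /js/.
html = '<script>var data = [{"text": "A quote"}];</script>'

# (?s:...) lets "." match newlines inside the array; the lazy .*? keeps
# growing until the closing bracket is immediately followed by ";".
raw = re.search(r"var data = ((?s:\[.*?\]));", html).group(1)
print(json.loads(raw))  # [{'text': 'A quote'}]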

code/exercise-4.py (new file, 36 lines added)

@@ -0,0 +1,36 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes_complete"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ["https://quotes.toscrape.com"]

    def parse(self, response):
        quotes = response.css(".quote")
        for quote in quotes:
            about_url = response.urljoin(quote.css("span a::attr(href)").get())
            quote_info = {
                "quote": quote.css(".text::text").get(),
                "author": quote.css(".author::text").get(),
                "author_url": about_url,
                "tags": quote.css(".tag *::text").getall(),
            }
            # dont_filter=True: several quotes share an author, so the same
            # about page must be fetched more than once; the partial item
            # travels to the callback in the request meta.
            yield scrapy.Request(
                about_url,
                callback=self.parse_about_page,
                meta={"quote_info": quote_info},
                dont_filter=True,
            )

        # Follow pagination only while a "Next" link exists.
        next_href = response.css(".next a::attr(href)").get()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href))

    def parse_about_page(self, response):
        quote = response.meta["quote_info"]
        author_born_date = response.css(".author-born-date::text").get()
        quote["author_born_date"] = author_born_date
        yield quote
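
Passing the partial item through response.meta works, but Scrapy (1.7+) also offers cb_kwargs, which delivers the value to the callback as a regular argument. A sketch of the same hand-off with that approach (a variant, not the commit's code):

import scrapy


class QuotesCbKwargsSpider(scrapy.Spider):
    # Hypothetical variant of exercise-4 using cb_kwargs instead of meta.
    name = "quotes_cb_kwargs"
    start_urls = ["https://quotes.toscrape.com"]

    def parse(self, response):
        for quote in response.css(".quote"):
            about_url = response.urljoin(quote.css("span a::attr(href)").get())
            quote_info = {
                "quote": quote.css(".text::text").get(),
                "author": quote.css(".author::text").get(),
            }
            yield scrapy.Request(
                about_url,
                callback=self.parse_about_page,
                cb_kwargs={"quote_info": quote_info},
                dont_filter=True,
            )

    def parse_about_page(self, response, quote_info):
        quote_info["author_born_date"] = response.css(".author-born-date::text").get()
        yield quote_info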


@@ -0,0 +1,27 @@
import scrapy


class QuotesPlaywrightSpider(scrapy.Spider):
    name = "quotes-playwright"
    custom_settings = {
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "PLAYWRIGHT_LAUNCH_OPTIONS": {
            "headless": True,
        },
    }

    def start_requests(self):
        yield scrapy.Request(
            url="http://quotes.toscrape.com/js/",
            meta={
                "playwright": False,
            },
        )

    async def parse(self, response):
        # With "playwright" disabled, the raw, non-rendered HTML is saved.
        with open("playwright-disabled.html", "w") as content:
            content.write(response.text)


@@ -0,0 +1,27 @@
import scrapy


class QuotesPlaywrightSpider(scrapy.Spider):
    name = "quotes-playwright"
    custom_settings = {
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "PLAYWRIGHT_LAUNCH_OPTIONS": {
            "headless": True,
        },
    }

    def start_requests(self):
        yield scrapy.Request(
            url="http://quotes.toscrape.com/js/",
            meta={
                "playwright": True,
            },
        )

    async def parse(self, response):
        # With "playwright" enabled, the JavaScript-rendered DOM is saved.
        with open("playwright-enabled.html", "w") as content:
            content.write(response.text)
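
The two spiders above are identical except for the "playwright" flag in meta and the file they write. Both require the scrapy-playwright package (pip install scrapy-playwright, then playwright install chromium to fetch a browser). After running both, a quick check along these lines should find the quote markup only in the rendered file, since the quotes on /js/ are injected by JavaScript; a sketch, not part of the commit:

# Compare the saved pages: only the Playwright-rendered one should
# contain the .quote elements.
for path in ("playwright-disabled.html", "playwright-enabled.html"):
    with open(path) as f:
        html = f.read()
    print(path, "->", 'class="quote"' in html)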

code/playwright-quotes.py (new file, 40 lines added)

@@ -0,0 +1,40 @@
import scrapy


class QuotesPlaywrightSpider(scrapy.Spider):
    name = "quotes-playwright"
    custom_settings = {
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "PLAYWRIGHT_LAUNCH_OPTIONS": {
            "headless": True,
        },
    }

    def start_requests(self):
        yield scrapy.Request(
            url="http://quotes.toscrape.com/js/",
            meta={
                "playwright": True,
            },
        )

    async def parse(self, response):
        quotes = response.css(".quote")
        for quote in quotes:
            yield {
                "quote": quote.css(".text::text").get(),
                "author": quote.css(".author::text").get(),
                "author_url": response.urljoin(quote.css("span a::attr(href)").get()),
                "tags": quote.css(".tag *::text").getall(),
            }

        # Follow pagination only while a "Next" link exists; keep rendering
        # every page with Playwright.
        next_href = response.css(".next a::attr(href)").get()
        if next_href:
            yield scrapy.Request(
                response.urljoin(next_href),
                meta={
                    "playwright": True,
                },
            )
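
On slower pages it can help to have Playwright wait for the content before Scrapy parses the response. scrapy-playwright supports this through page methods; a sketch using its PageMethod helper to wait for the first quote (custom_settings as in the spider above, omitted for brevity):

import scrapy
from scrapy_playwright.page import PageMethod


class QuotesWaitSpider(scrapy.Spider):
    # Hypothetical variant: block until at least one quote has rendered.
    name = "quotes-playwright-wait"

    def start_requests(self):
        yield scrapy.Request(
            "http://quotes.toscrape.com/js/",
            meta={
                "playwright": True,
                "playwright_page_methods": [
                    PageMethod("wait_for_selector", ".quote"),
                ],
            },
        )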

code/pyconus2024-css.py (new file, 17 lines added)

@@ -0,0 +1,17 @@
import scrapy


class PyConUS2024Spider(scrapy.Spider):
    name = "pyconus"
    start_urls = [
        "https://us.pycon.org/2024/schedule/tutorials/",
    ]

    def parse(self, response):
        for tutorial in response.css(".presentation"):
            yield {
                "speaker": tutorial.css(".speaker::text").get().strip(),
                "url": response.urljoin(tutorial.css(".title a::attr(href)").get()),
                "title": tutorial.css(".title a::text").get(),
            }
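
The CSS selectors are the heart of this exercise, and they can be iterated on without re-crawling by testing them against a saved copy of the page with parsel (already used elsewhere in this commit). A sketch; tutorials.html is a hypothetical local file name:

from parsel import Selector

# tutorials.html: a saved copy of the schedule page (hypothetical name).
sel = Selector(text=open("tutorials.html", encoding="utf-8").read())
print(len(sel.css(".presentation")), "tutorials found")
print(sel.css(".presentation .title a::text").get())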


@@ -1,8 +1,8 @@
 import requests
 from parsel import Selector

-response = requests.get('https://us.pycon.org/2024/schedule/tutorials/')
+response = requests.get("https://us.pycon.org/2024/schedule/tutorials/")
 sel = Selector(text=response.text)

-for tutorial in sel.css('.calendar a::text').getall():
-    print(tutorial)
+for tutorial in sel.css(".calendar a::text").getall():
+    print(tutorial)

code/pyconus2024-xpath.py (new file, 19 lines added)

@@ -0,0 +1,19 @@
import scrapy


class PyConUS2024Spider(scrapy.Spider):
    name = "pyconus"
    start_urls = [
        "https://us.pycon.org/2024/schedule/tutorials/",
    ]

    def parse(self, response):
        for tutorial in response.xpath('//div[@class="presentation"]'):
            yield {
                "speaker": tutorial.xpath('./div[@class="speaker"]/text()')
                .get()
                .strip(),
                "url": response.urljoin(tutorial.xpath(".//a/@href").get()),
                "title": tutorial.xpath(".//a/text()").get(),
            }


@@ -1,12 +1,13 @@
 import scrapy


 class PyConUS2024Spider(scrapy.Spider):
     name = "pyconus"
     start_urls = [
-        'https://us.pycon.org/2024/schedule/tutorials/',
+        "https://us.pycon.org/2024/schedule/tutorials/",
     ]

     def parse(self, response):
-        for tutorial in response.css('.calendar a::text').getall():
+        for tutorial in response.css(".calendar a::text").getall():
             yield {"title": tutorial}