Reorganize example code

parent 7c10d1c4b0
commit 6580c266dd

8 changed files with 143 additions and 56 deletions
@@ -7,15 +7,9 @@ class QuotesSpider(scrapy.Spider):
     start_urls = ["https://quotes.toscrape.com"]
 
     def parse(self, response):
-        quotes = response.css(".quote")
-        for quote in quotes:
-            yield {
-                "quote": quote.css(".text::text").get(),
-                "author": quote.css(".author::text").get(),
-                "author_url": response.urljoin(quote.css("span a::attr(href)").get()),
-                "tags": quote.css(".tag *::text").getall(),
-            }
-
-        yield scrapy.Request(
-            response.urljoin(response.css(".next a::attr(href)").get())
-        )
+        # 1. Get the list of quotes available at the page
+
+        # 2. Parse each quote found and yield the quote item
+
+        # 3. Follow the next page link
+        ...
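A note for anyone attempting this exercise: each numbered hint maps to one selector, and all of them can be tried interactively before parse is written. A minimal sketch, assuming the live site still serves this markup (run inside "scrapy shell https://quotes.toscrape.com", where response is prepopulated):

    quotes = response.css(".quote")            # 1. the quote blocks on the page
    quotes[0].css(".text::text").get()         # 2. text of the first quote
    response.css(".next a::attr(href)").get()  # 3. relative URL of the next page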
@@ -4,25 +4,13 @@ import scrapy
 class QuotesScrollSpider(scrapy.Spider):
     name = "quotes_scroll"
     allowed_domains = ["quotes.toscrape.com"]
-    api_url = "https://quotes.toscrape.com/api/quotes?page={page}"
 
     def start_requests(self):
-        yield scrapy.Request(self.api_url.format(page=1))
+        # What would be a good first request for this spider?
+        ...
 
     def parse(self, response):
+        # The API response is JSON content
         data = response.json()
-        current_page = data.get("page")
-        for quote in data.get("quotes"):
-            yield {
-                "quote": quote.get("text"),
-                "author": quote.get("author").get("name"),
-                "author_url": response.urljoin(
-                    quote.get("author").get("goodreads_link")
-                ),
-                "tags": quote.get("tags"),
-            }
-        if data.get("has_next"):
-            next_page = current_page + 1
-            yield scrapy.Request(
-                self.api_url.format(page=next_page),
-            )
+        # Parse the data here
+        ...
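The solution removed here (restored below as code/exercise-solutions/exercise-2.py) reads the page, has_next, and quotes keys, plus the per-quote text, author.name, author.goodreads_link, and tags. A sketch of the payload shape those keys imply, with illustrative values rather than captured API output:

    data = {
        "page": 1,
        "has_next": True,
        "quotes": [
            {
                "text": "A quote.",
                "author": {"name": "Author Name", "goodreads_link": "/author/show/..."},
                "tags": ["books", "reading"],
            },
        ],
    }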
@@ -8,17 +8,9 @@ class QuotesJSSpider(scrapy.Spider):
     start_urls = ["https://quotes.toscrape.com/js/"]
 
     def parse(self, response):
+        # 1. Find the raw data inside the HTML
         raw_quotes = response.xpath("//script").re_first(r"var data = ((?s:\[.*?\]));")
-        quotes = json.loads(raw_quotes)
-        for quote in quotes:
-            yield {
-                "quote": quote.get("text"),
-                "author": quote.get("author").get("name"),
-                "author_url": response.urljoin(
-                    quote.get("author").get("goodreads_link")
-                ),
-                "tags": quote.get("tags"),
-            }
-        yield scrapy.Request(
-            response.urljoin(response.css(".next a::attr(href)").get())
-        )
+        # 2. With the raw data, convert it to Python and parse it
+
+        # 3. Don't forget we have pagination here too
+        ...
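The re_first pattern kept in this exercise pulls a JSON array out of an inline <script> element; the scoped flag (?s:...) lets the dot match newlines, so the array may span several lines of page source. A self-contained sketch of that extraction step, using a made-up script body in place of the real page:

    import json
    import re

    script = 'var data = [{"text": "A quote.", "tags": ["books"]}];'
    raw = re.search(r"var data = ((?s:\[.*?\]));", script).group(1)
    print(json.loads(raw))  # [{'text': 'A quote.', 'tags': ['books']}]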
@@ -7,30 +7,28 @@ class QuotesSpider(scrapy.Spider):
     start_urls = ["https://quotes.toscrape.com"]
 
     def parse(self, response):
+        # This exercise is an improvement of Exercise 1,
+        # so you could use it as a starting point
         quotes = response.css(".quote")
         for quote in quotes:
-            about_url = response.urljoin(quote.css("span a::attr(href)").get())
-
-            quote_info = {
+            yield {
                 "quote": quote.css(".text::text").get(),
                 "author": quote.css(".author::text").get(),
-                "author_url": about_url,
+                # The remaining data that we want requires that we gather data
+                # from the content of this URL
+                "author_url": response.urljoin(quote.css("span a::attr(href)").get()),
                 "tags": quote.css(".tag *::text").getall(),
             }
 
-            yield scrapy.Request(
-                about_url,
-                callback=self.parse_about_page,
-                meta={"quote_info": quote_info},
-                dont_filter=True,
-            )
+        # How to send the partially filled item to a new page?
 
         yield scrapy.Request(
-            response.urljoin(response.css(".next a::attr(href)").get()),
+            response.urljoin(response.css(".next a::attr(href)").get())
         )
 
     def parse_about_page(self, response):
-        quote = response.meta["quote_info"]
-        author_born_date = response.css(".author-born-date::text").get()
-        quote["author_born_date"] = author_born_date
-        yield quote
+        # We need to parse the about page as well
+        ...
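A side note on the hand-off this exercise teaches: the solution (code/exercise-solutions/exercise-4.py below) carries the partial item in meta. Since Scrapy 1.7, cb_kwargs does the same job and delivers the value as a named callback argument; a sketch of the equivalent wiring, assuming nothing else needs to read the value from meta:

    yield scrapy.Request(
        about_url,
        callback=self.parse_about_page,
        cb_kwargs={"quote_info": quote_info},
        dont_filter=True,
    )

    def parse_about_page(self, response, quote_info):
        quote_info["author_born_date"] = response.css(".author-born-date::text").get()
        yield quote_info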
code/exercise-solutions/exercise-1.py (new file, 21 lines)
@@ -0,0 +1,21 @@
+import scrapy
+
+
+class QuotesSpider(scrapy.Spider):
+    name = "quotes"
+    allowed_domains = ["quotes.toscrape.com"]
+    start_urls = ["https://quotes.toscrape.com"]
+
+    def parse(self, response):
+        quotes = response.css(".quote")
+        for quote in quotes:
+            yield {
+                "quote": quote.css(".text::text").get(),
+                "author": quote.css(".author::text").get(),
+                "author_url": response.urljoin(quote.css("span a::attr(href)").get()),
+                "tags": quote.css(".tag *::text").getall(),
+            }
+
+        yield scrapy.Request(
+            response.urljoin(response.css(".next a::attr(href)").get())
+        )
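Each solution file is a standalone spider, so it can be run without a project scaffold, e.g. "scrapy runspider code/exercise-solutions/exercise-1.py -o quotes.json", where -o writes the scraped items to a feed file.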
code/exercise-solutions/exercise-2.py (new file, 31 lines)
@@ -0,0 +1,31 @@
+import scrapy
+
+
+class QuotesScrollSpider(scrapy.Spider):
+    name = "quotes_scroll"
+    allowed_domains = ["quotes.toscrape.com"]
+    api_url = "https://quotes.toscrape.com/api/quotes?page={page}"
+
+    def start_requests(self):
+        yield scrapy.Request(self.api_url.format(page=1))
+
+    def parse(self, response):
+        data = response.json()
+
+        for quote in data.get("quotes"):
+            yield {
+                "quote": quote.get("text"),
+                "author": quote.get("author").get("name"),
+                "author_url": response.urljoin(
+                    quote.get("author").get("goodreads_link")
+                ),
+                "tags": quote.get("tags"),
+            }
+
+        current_page = data.get("page")
+
+        if data.get("has_next"):
+            next_page = current_page + 1
+            yield scrapy.Request(
+                self.api_url.format(page=next_page),
+            )
code/exercise-solutions/exercise-3.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+import json
+import scrapy
+
+
+class QuotesJSSpider(scrapy.Spider):
+    name = "quotes_js"
+    allowed_domains = ["quotes.toscrape.com"]
+    start_urls = ["https://quotes.toscrape.com/js/"]
+
+    def parse(self, response):
+        raw_quotes = response.xpath("//script").re_first(r"var data = ((?s:\[.*?\]));")
+
+        quotes = json.loads(raw_quotes)
+
+        for quote in quotes:
+            yield {
+                "quote": quote.get("text"),
+                "author": quote.get("author").get("name"),
+                "author_url": response.urljoin(
+                    quote.get("author").get("goodreads_link")
+                ),
+                "tags": quote.get("tags"),
+            }
+
+        yield scrapy.Request(
+            response.urljoin(response.css(".next a::attr(href)").get())
+        )
code/exercise-solutions/exercise-4.py (new file, 36 lines)
@@ -0,0 +1,36 @@
+import scrapy
+
+
+class QuotesSpider(scrapy.Spider):
+    name = "quotes_complete"
+    allowed_domains = ["quotes.toscrape.com"]
+    start_urls = ["https://quotes.toscrape.com"]
+
+    def parse(self, response):
+        quotes = response.css(".quote")
+        for quote in quotes:
+            about_url = response.urljoin(quote.css("span a::attr(href)").get())
+
+            quote_info = {
+                "quote": quote.css(".text::text").get(),
+                "author": quote.css(".author::text").get(),
+                "author_url": about_url,
+                "tags": quote.css(".tag *::text").getall(),
+            }
+
+            yield scrapy.Request(
+                about_url,
+                callback=self.parse_about_page,
+                meta={"quote_info": quote_info},
+                dont_filter=True,
+            )
+
+        yield scrapy.Request(
+            response.urljoin(response.css(".next a::attr(href)").get()),
+        )
+
+    def parse_about_page(self, response):
+        quote = response.meta["quote_info"]
+        author_born_date = response.css(".author-born-date::text").get()
+        quote["author_born_date"] = author_born_date
+        yield quote
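One design choice in exercise-4.py worth calling out: dont_filter=True is needed because several quotes on a page can share an author, so the same about-page URL is requested more than once; without it, Scrapy's duplicate-request filter would drop the repeated requests and those items would never be yielded.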