diff --git a/code/exercise-1.py b/code/exercise-1.py
index 1b0a0fd..2cfb4d9 100644
--- a/code/exercise-1.py
+++ b/code/exercise-1.py
@@ -7,15 +7,9 @@ class QuotesSpider(scrapy.Spider):
     start_urls = ["https://quotes.toscrape.com"]
 
     def parse(self, response):
-        quotes = response.css(".quote")
-        for quote in quotes:
-            yield {
-                "quote": quote.css(".text::text").get(),
-                "author": quote.css(".author::text").get(),
-                "author_url": response.urljoin(quote.css("span a::attr(href)").get()),
-                "tags": quote.css(".tag *::text").getall(),
-            }
+        # 1. Get the list of quotes available on the page
 
-        yield scrapy.Request(
-            response.urljoin(response.css(".next a::attr(href)").get())
-        )
+        # 2. Parse each quote found and yield the quote item
+
+        # 3. Follow the next page link
+        ...
\ No newline at end of file
diff --git a/code/exercise-2.py b/code/exercise-2.py
index 8bafc73..54182e1 100644
--- a/code/exercise-2.py
+++ b/code/exercise-2.py
@@ -4,25 +4,15 @@ import scrapy
 class QuotesScrollSpider(scrapy.Spider):
     name = "quotes_scroll"
     allowed_domains = ["quotes.toscrape.com"]
-    api_url = "https://quotes.toscrape.com/api/quotes?page={page}"
 
     def start_requests(self):
-        yield scrapy.Request(self.api_url.format(page=1))
+        # What would be a good first request for this spider?
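+        # Hint: scroll the page with your browser's network tab open and
+        # watch the request the page makes to fetch more quotes.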
+        ...
 
     def parse(self, response):
+        # The API response is JSON
         data = response.json()
-        current_page = data.get("page")
-        for quote in data.get("quotes"):
-            yield {
-                "quote": quote.get("text"),
-                "author": quote.get("author").get("name"),
-                "author_url": response.urljoin(
-                    quote.get("author").get("goodreads_link")
-                ),
-                "tags": quote.get("tags"),
-            }
-            if data.get("has_next"):
-                next_page = current_page + 1
-                yield scrapy.Request(
-                    self.api_url.format(page=next_page),
-                )
+
+        # Parse the data here
\ No newline at end of file
diff --git a/code/exercise-3.py b/code/exercise-3.py
index b75c1ac..8b25440 100644
--- a/code/exercise-3.py
+++ b/code/exercise-3.py
@@ -8,17 +8,9 @@ class QuotesJSSpider(scrapy.Spider):
     start_urls = ["https://quotes.toscrape.com/js/"]
 
     def parse(self, response):
+        # 1. Find the raw data inside the HTML
         raw_quotes = response.xpath("//script").re_first(r"var data = ((?s:\[.*?\]));")
-        quotes = json.loads(raw_quotes)
-        for quote in quotes:
-            yield {
-                "quote": quote.get("text"),
-                "author": quote.get("author").get("name"),
-                "author_url": response.urljoin(
-                    quote.get("author").get("goodreads_link")
-                ),
-                "tags": quote.get("tags"),
-            }
-        yield scrapy.Request(
-            response.urljoin(response.css(".next a::attr(href)").get())
-        )
+
+        # 2. Convert the raw data to Python objects and parse them
+
+        # 3. Don't forget we have pagination here too
\ No newline at end of file
diff --git a/code/exercise-4.py b/code/exercise-4.py
index 66d4471..804d6ab 100644
--- a/code/exercise-4.py
+++ b/code/exercise-4.py
@@ -7,30 +7,30 @@ class QuotesSpider(scrapy.Spider):
     start_urls = ["https://quotes.toscrape.com"]
 
     def parse(self, response):
+        # This exercise builds on Exercise 1,
+        # so you can use it as a starting point
         quotes = response.css(".quote")
         for quote in quotes:
-            about_url = response.urljoin(quote.css("span a::attr(href)").get())
-
-            quote_info = {
+            yield {
                 "quote": quote.css(".text::text").get(),
                 "author": quote.css(".author::text").get(),
-                "author_url": about_url,
+
+                # The remaining data we want has to be gathered
+                # from the page at this URL
+                "author_url": response.urljoin(quote.css("span a::attr(href)").get()),
+
                 "tags": quote.css(".tag *::text").getall(),
             }
 
-            yield scrapy.Request(
-                about_url,
-                callback=self.parse_about_page,
-                meta={"quote_info": quote_info},
-                dont_filter=True,
-            )
+            # How to send the partially filled item to a new page?
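+            # Hint: scrapy.Request accepts callback= and meta= arguments;
+            # they let a request carry data along to another parse method.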
 
         yield scrapy.Request(
-            response.urljoin(response.css(".next a::attr(href)").get()),
+            response.urljoin(response.css(".next a::attr(href)").get())
         )
+
 
     def parse_about_page(self, response):
-        quote = response.meta["quote_info"]
-        author_born_date = response.css(".author-born-date::text").get()
-        quote["author_born_date"] = author_born_date
-        yield quote
+        # We need to parse the about page as well
+        ...
\ No newline at end of file
diff --git a/code/exercise-solutions/exercise-1.py b/code/exercise-solutions/exercise-1.py
new file mode 100644
index 0000000..1b0a0fd
--- /dev/null
+++ b/code/exercise-solutions/exercise-1.py
@@ -0,0 +1,21 @@
+import scrapy
+
+
+class QuotesSpider(scrapy.Spider):
+    name = "quotes"
+    allowed_domains = ["quotes.toscrape.com"]
+    start_urls = ["https://quotes.toscrape.com"]
+
+    def parse(self, response):
+        quotes = response.css(".quote")
+        for quote in quotes:
+            yield {
+                "quote": quote.css(".text::text").get(),
+                "author": quote.css(".author::text").get(),
+                "author_url": response.urljoin(quote.css("span a::attr(href)").get()),
+                "tags": quote.css(".tag *::text").getall(),
+            }
+
+        yield scrapy.Request(
+            response.urljoin(response.css(".next a::attr(href)").get())
+        )
diff --git a/code/exercise-solutions/exercise-2.py b/code/exercise-solutions/exercise-2.py
new file mode 100644
index 0000000..49235d8
--- /dev/null
+++ b/code/exercise-solutions/exercise-2.py
@@ -0,0 +1,31 @@
+import scrapy
+
+
+class QuotesScrollSpider(scrapy.Spider):
+    name = "quotes_scroll"
+    allowed_domains = ["quotes.toscrape.com"]
+    api_url = "https://quotes.toscrape.com/api/quotes?page={page}"
+
+    def start_requests(self):
+        yield scrapy.Request(self.api_url.format(page=1))
+
+    def parse(self, response):
+        data = response.json()
+
+        for quote in data.get("quotes"):
+            yield {
+                "quote": quote.get("text"),
+                "author": quote.get("author").get("name"),
+                "author_url": response.urljoin(
+                    quote.get("author").get("goodreads_link")
+                ),
+                "tags": quote.get("tags"),
+            }
+
+        current_page = data.get("page")
+
+        if data.get("has_next"):
+            next_page = current_page + 1
+            yield scrapy.Request(
+                self.api_url.format(page=next_page),
+            )
diff --git a/code/exercise-solutions/exercise-3.py b/code/exercise-solutions/exercise-3.py
new file mode 100644
index 0000000..efb2e1a
--- /dev/null
+++ b/code/exercise-solutions/exercise-3.py
@@ -0,0 +1,27 @@
+import json
+import scrapy
+
+
+class QuotesJSSpider(scrapy.Spider):
+    name = "quotes_js"
+    allowed_domains = ["quotes.toscrape.com"]
+    start_urls = ["https://quotes.toscrape.com/js/"]
+
+    def parse(self, response):
+        raw_quotes = response.xpath("//script").re_first(r"var data = ((?s:\[.*?\]));")
+
+        quotes = json.loads(raw_quotes)
+
+        for quote in quotes:
+            yield {
+                "quote": quote.get("text"),
+                "author": quote.get("author").get("name"),
+                "author_url": response.urljoin(
+                    quote.get("author").get("goodreads_link")
+                ),
+                "tags": quote.get("tags"),
+            }
+
+        yield scrapy.Request(
+            response.urljoin(response.css(".next a::attr(href)").get())
+        )
diff --git a/code/exercise-solutions/exercise-4.py b/code/exercise-solutions/exercise-4.py
new file mode 100644
index 0000000..66d4471
--- /dev/null
+++ b/code/exercise-solutions/exercise-4.py
@@ -0,0 +1,36 @@
+import scrapy
+
+
+class QuotesSpider(scrapy.Spider):
+    name = "quotes_complete"
+    allowed_domains = ["quotes.toscrape.com"]
+    start_urls = ["https://quotes.toscrape.com"]
+
+    def parse(self, response):
+        quotes = response.css(".quote")
+        for quote in quotes:
+            about_url = response.urljoin(quote.css("span a::attr(href)").get())
+
+            quote_info = {
+                "quote": quote.css(".text::text").get(),
+                "author": quote.css(".author::text").get(),
+                "author_url": about_url,
+                "tags": quote.css(".tag *::text").getall(),
+            }
+
+            yield scrapy.Request(
+                about_url,
+                callback=self.parse_about_page,
+                meta={"quote_info": quote_info},
+                dont_filter=True,
+            )
+
+        yield scrapy.Request(
+            response.urljoin(response.css(".next a::attr(href)").get()),
+        )
+
+    def parse_about_page(self, response):
+        quote = response.meta["quote_info"]
+        author_born_date = response.css(".author-born-date::text").get()
+        quote["author_born_date"] = author_born_date
+        yield quote