Reorganize example code

This commit is contained in:
Renne Rocha 2024-05-12 10:18:16 -03:00
parent 7c10d1c4b0
commit 6580c266dd
8 changed files with 143 additions and 56 deletions

View file

@ -7,15 +7,9 @@ class QuotesSpider(scrapy.Spider):
start_urls = ["https://quotes.toscrape.com"]
def parse(self, response):
quotes = response.css(".quote")
for quote in quotes:
yield {
"quote": quote.css(".text::text").get(),
"author": quote.css(".author::text").get(),
"author_url": response.urljoin(quote.css("span a::attr(href)").get()),
"tags": quote.css(".tag *::text").getall(),
}
# 1. Get the list of quotes available at the page
yield scrapy.Request(
response.urljoin(response.css(".next a::attr(href)").get())
)
# 2. Parse each quote found and yield the quote item
# 3. Follow the next page link
...

View file

@ -4,25 +4,13 @@ import scrapy
class QuotesScrollSpider(scrapy.Spider):
name = "quotes_scroll"
allowed_domains = ["quotes.toscrape.com"]
api_url = "https://quotes.toscrape.com/api/quotes?page={page}"
def start_requests(self):
yield scrapy.Request(self.api_url.format(page=1))
# What would be a good first request for this spider?
...
def parse(self, response):
# API response is a JSON content
data = response.json()
current_page = data.get("page")
for quote in data.get("quotes"):
yield {
"quote": quote.get("text"),
"author": quote.get("author").get("name"),
"author_url": response.urljoin(
quote.get("author").get("goodreads_link")
),
"tags": quote.get("tags"),
}
if data.get("has_next"):
next_page = current_page + 1
yield scrapy.Request(
self.api_url.format(page=next_page),
)
# Parse the data here

View file

@ -8,17 +8,9 @@ class QuotesJSSpider(scrapy.Spider):
start_urls = ["https://quotes.toscrape.com/js/"]
def parse(self, response):
# 1. Find the raw data inside the HTML
raw_quotes = response.xpath("//script").re_first(r"var data = ((?s:\[.*?\]));")
quotes = json.loads(raw_quotes)
for quote in quotes:
yield {
"quote": quote.get("text"),
"author": quote.get("author").get("name"),
"author_url": response.urljoin(
quote.get("author").get("goodreads_link")
),
"tags": quote.get("tags"),
}
yield scrapy.Request(
response.urljoin(response.css(".next a::attr(href)").get())
)
# 2. With the raw data, convert it to Python and parse it
# 3. Don't forget we have pagination here too

View file

@ -7,30 +7,28 @@ class QuotesSpider(scrapy.Spider):
start_urls = ["https://quotes.toscrape.com"]
def parse(self, response):
# This exercise is an improvement of Exercise 1
# so you could use it as a start point
quotes = response.css(".quote")
for quote in quotes:
about_url = response.urljoin(quote.css("span a::attr(href)").get())
quote_info = {
yield {
"quote": quote.css(".text::text").get(),
"author": quote.css(".author::text").get(),
"author_url": about_url,
# The remaining data that we want requires that we gather data
# from the content of this URL
"author_url": response.urljoin(quote.css("span a::attr(href)").get()),
"tags": quote.css(".tag *::text").getall(),
}
yield scrapy.Request(
about_url,
callback=self.parse_about_page,
meta={"quote_info": quote_info},
dont_filter=True,
)
# How to send the partially filled item to a new page?
yield scrapy.Request(
response.urljoin(response.css(".next a::attr(href)").get()),
response.urljoin(response.css(".next a::attr(href)").get())
)
def parse_about_page(self, response):
quote = response.meta["quote_info"]
author_born_date = response.css(".author-born-date::text").get()
quote["author_born_date"] = author_born_date
yield quote
# We need to parse about page as well
...

View file

@ -0,0 +1,21 @@
import scrapy
class QuotesSpider(scrapy.Spider):
    """Crawl quotes.toscrape.com and yield one item per quote, following pagination."""

    name = "quotes"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ["https://quotes.toscrape.com"]

    def parse(self, response):
        """Extract every quote on the page, then follow the next-page link if present.

        Yields dicts with keys: quote, author, author_url, tags.
        """
        for quote in response.css(".quote"):
            yield {
                "quote": quote.css(".text::text").get(),
                "author": quote.css(".author::text").get(),
                "author_url": response.urljoin(quote.css("span a::attr(href)").get()),
                "tags": quote.css(".tag *::text").getall(),
            }
        # Bug fix: on the last page ".next a" is absent and .get() returns None;
        # response.urljoin(None) raises, so only follow when a link exists.
        next_href = response.css(".next a::attr(href)").get()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href))

View file

@ -0,0 +1,31 @@
import scrapy
class QuotesScrollSpider(scrapy.Spider):
    """Scrape quotes via the JSON API that backs the infinite-scroll page."""

    name = "quotes_scroll"
    allowed_domains = ["quotes.toscrape.com"]
    # Paginated API endpoint; pages are numbered starting at 1.
    api_url = "https://quotes.toscrape.com/api/quotes?page={page}"

    def start_requests(self):
        """Start from page 1 of the paginated JSON API."""
        yield scrapy.Request(self.api_url.format(page=1))

    def parse(self, response):
        """Parse one JSON API page; yield items and request the next page if any."""
        data = response.json()
        # Robustness: default to an empty list so a payload without a
        # "quotes" key does not raise TypeError when iterated.
        for quote in data.get("quotes", []):
            # Robustness: tolerate a missing "author" object instead of
            # raising AttributeError on None.get(...).
            author = quote.get("author") or {}
            yield {
                "quote": quote.get("text"),
                "author": author.get("name"),
                "author_url": response.urljoin(author.get("goodreads_link")),
                "tags": quote.get("tags"),
            }
        if data.get("has_next"):
            next_page = data.get("page") + 1
            yield scrapy.Request(self.api_url.format(page=next_page))

View file

@ -0,0 +1,27 @@
import json
import scrapy
class QuotesJSSpider(scrapy.Spider):
    """Scrape the JS-rendered quotes page by parsing the data array embedded in a <script> tag."""

    name = "quotes_js"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ["https://quotes.toscrape.com/js/"]

    def parse(self, response):
        """Extract the embedded ``var data = [...]`` JSON array and yield one item per quote."""
        raw_quotes = response.xpath("//script").re_first(r"var data = ((?s:\[.*?\]));")
        # Bug fix: re_first returns None when the pattern is absent, and
        # json.loads(None) raises TypeError — bail out early instead.
        if raw_quotes is None:
            return
        for quote in json.loads(raw_quotes):
            # Robustness: tolerate a missing "author" object.
            author = quote.get("author") or {}
            yield {
                "quote": quote.get("text"),
                "author": author.get("name"),
                "author_url": response.urljoin(author.get("goodreads_link")),
                "tags": quote.get("tags"),
            }
        # Bug fix: ".next a" is absent on the last page and .get() returns
        # None; response.urljoin(None) raises, so only follow a real link.
        next_href = response.css(".next a::attr(href)").get()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href))

View file

@ -0,0 +1,36 @@
import scrapy
class QuotesSpider(scrapy.Spider):
    """Crawl quotes and enrich each item with the author's birth date from the about page."""

    name = "quotes_complete"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ["https://quotes.toscrape.com"]

    def parse(self, response):
        """Build a partial item per quote and hand it to the author "about" page callback."""
        for quote in response.css(".quote"):
            about_url = response.urljoin(quote.css("span a::attr(href)").get())
            quote_info = {
                "quote": quote.css(".text::text").get(),
                "author": quote.css(".author::text").get(),
                "author_url": about_url,
                "tags": quote.css(".tag *::text").getall(),
            }
            # dont_filter: many quotes share an author, and the duplicate
            # filter would otherwise drop all but the first about-page request.
            yield scrapy.Request(
                about_url,
                callback=self.parse_about_page,
                meta={"quote_info": quote_info},
                dont_filter=True,
            )
        # Bug fix: ".next a" is absent on the last page and .get() returns
        # None; response.urljoin(None) raises, so only follow a real link.
        next_href = response.css(".next a::attr(href)").get()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href))

    def parse_about_page(self, response):
        """Complete the partial item carried in meta with the author's birth date, then yield it."""
        quote = response.meta["quote_info"]
        quote["author_born_date"] = response.css(".author-born-date::text").get()
        yield quote