Reorganize example code

parent 7c10d1c4b0
commit 6580c266dd

8 changed files with 143 additions and 56 deletions
@@ -7,15 +7,9 @@ class QuotesSpider(scrapy.Spider):
     start_urls = ["https://quotes.toscrape.com"]
 
     def parse(self, response):
-        quotes = response.css(".quote")
-        for quote in quotes:
-            yield {
-                "quote": quote.css(".text::text").get(),
-                "author": quote.css(".author::text").get(),
-                "author_url": response.urljoin(quote.css("span a::attr(href)").get()),
-                "tags": quote.css(".tag *::text").getall(),
-            }
-
-        yield scrapy.Request(
-            response.urljoin(response.css(".next a::attr(href)").get())
-        )
+        # 1. Get the list of quotes available at the page
+
+        # 2. Parse each quote found and yield the quote item
+
+        # 3. Follow the next page link
+        ...
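A note for anyone attempting this exercise: each numbered hint maps to one selector, and all of them can be tried interactively before parse is written. A minimal sketch, assuming the live site still serves this markup (run inside "scrapy shell https://quotes.toscrape.com", where response is prepopulated):

    quotes = response.css(".quote")            # 1. the quote blocks on the page
    quotes[0].css(".text::text").get()         # 2. text of the first quote
    response.css(".next a::attr(href)").get()  # 3. relative URL of the next page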
@@ -4,25 +4,13 @@ import scrapy
 class QuotesScrollSpider(scrapy.Spider):
     name = "quotes_scroll"
     allowed_domains = ["quotes.toscrape.com"]
-    api_url = "https://quotes.toscrape.com/api/quotes?page={page}"
 
     def start_requests(self):
-        yield scrapy.Request(self.api_url.format(page=1))
+        # What would be a good first request for this spider?
+        ...
 
     def parse(self, response):
+        # The API response is JSON content
         data = response.json()
-        current_page = data.get("page")
-        for quote in data.get("quotes"):
-            yield {
-                "quote": quote.get("text"),
-                "author": quote.get("author").get("name"),
-                "author_url": response.urljoin(
-                    quote.get("author").get("goodreads_link")
-                ),
-                "tags": quote.get("tags"),
-            }
-        if data.get("has_next"):
-            next_page = current_page + 1
-            yield scrapy.Request(
-                self.api_url.format(page=next_page),
-            )
+        # Parse the data here
+        ...
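The solution removed here (restored below as code/exercise-solutions/exercise-2.py) reads the page, has_next, and quotes keys, plus the per-quote text, author.name, author.goodreads_link, and tags. A sketch of the payload shape those keys imply, with illustrative values rather than captured API output:

    data = {
        "page": 1,
        "has_next": True,
        "quotes": [
            {
                "text": "A quote.",
                "author": {"name": "Author Name", "goodreads_link": "/author/show/..."},
                "tags": ["books", "reading"],
            },
        ],
    }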
@@ -8,17 +8,9 @@ class QuotesJSSpider(scrapy.Spider):
     start_urls = ["https://quotes.toscrape.com/js/"]
 
     def parse(self, response):
+        # 1. Find the raw data inside the HTML
         raw_quotes = response.xpath("//script").re_first(r"var data = ((?s:\[.*?\]));")
-        quotes = json.loads(raw_quotes)
-        for quote in quotes:
-            yield {
-                "quote": quote.get("text"),
-                "author": quote.get("author").get("name"),
-                "author_url": response.urljoin(
-                    quote.get("author").get("goodreads_link")
-                ),
-                "tags": quote.get("tags"),
-            }
-        yield scrapy.Request(
-            response.urljoin(response.css(".next a::attr(href)").get())
-        )
+        # 2. With the raw data, convert it to Python and parse it
+
+        # 3. Don't forget we have pagination here too
+        ...
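The re_first pattern kept in this exercise pulls a JSON array out of an inline <script> element; the scoped flag (?s:...) lets the dot match newlines, so the array may span several lines of page source. A self-contained sketch of that extraction step, using a made-up script body in place of the real page:

    import json
    import re

    script = 'var data = [{"text": "A quote.", "tags": ["books"]}];'
    raw = re.search(r"var data = ((?s:\[.*?\]));", script).group(1)
    print(json.loads(raw))  # [{'text': 'A quote.', 'tags': ['books']}]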
@@ -7,30 +7,28 @@ class QuotesSpider(scrapy.Spider):
     start_urls = ["https://quotes.toscrape.com"]
 
     def parse(self, response):
+        # This exercise is an improvement of Exercise 1,
+        # so you could use it as a starting point
         quotes = response.css(".quote")
         for quote in quotes:
-            about_url = response.urljoin(quote.css("span a::attr(href)").get())
-
-            quote_info = {
+            yield {
                 "quote": quote.css(".text::text").get(),
                 "author": quote.css(".author::text").get(),
-                "author_url": about_url,
+                # The remaining data that we want requires that we gather data
+                # from the content of this URL
+                "author_url": response.urljoin(quote.css("span a::attr(href)").get()),
                 "tags": quote.css(".tag *::text").getall(),
             }
 
-            yield scrapy.Request(
-                about_url,
-                callback=self.parse_about_page,
-                meta={"quote_info": quote_info},
-                dont_filter=True,
-            )
+        # How to send the partially filled item to a new page?
 
         yield scrapy.Request(
-            response.urljoin(response.css(".next a::attr(href)").get()),
+            response.urljoin(response.css(".next a::attr(href)").get())
         )
 
     def parse_about_page(self, response):
-        quote = response.meta["quote_info"]
-        author_born_date = response.css(".author-born-date::text").get()
-        quote["author_born_date"] = author_born_date
-        yield quote
+        # We need to parse the about page as well
+        ...
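A side note on the hand-off this exercise teaches: the solution (code/exercise-solutions/exercise-4.py below) carries the partial item in meta. Since Scrapy 1.7, cb_kwargs does the same job and delivers the value as a named callback argument; a sketch of the equivalent wiring, assuming nothing else needs to read the value from meta:

    yield scrapy.Request(
        about_url,
        callback=self.parse_about_page,
        cb_kwargs={"quote_info": quote_info},
        dont_filter=True,
    )

    def parse_about_page(self, response, quote_info):
        quote_info["author_born_date"] = response.css(".author-born-date::text").get()
        yield quote_info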
code/exercise-solutions/exercise-1.py (new file, 21 lines)
@@ -0,0 +1,21 @@
+import scrapy
+
+
+class QuotesSpider(scrapy.Spider):
+    name = "quotes"
+    allowed_domains = ["quotes.toscrape.com"]
+    start_urls = ["https://quotes.toscrape.com"]
+
+    def parse(self, response):
+        quotes = response.css(".quote")
+        for quote in quotes:
+            yield {
+                "quote": quote.css(".text::text").get(),
+                "author": quote.css(".author::text").get(),
+                "author_url": response.urljoin(quote.css("span a::attr(href)").get()),
+                "tags": quote.css(".tag *::text").getall(),
+            }
+
+        yield scrapy.Request(
+            response.urljoin(response.css(".next a::attr(href)").get())
+        )
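Each solution file is a standalone spider, so it can be run without a project scaffold, e.g. "scrapy runspider code/exercise-solutions/exercise-1.py -o quotes.json", where -o writes the scraped items to a feed file.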
code/exercise-solutions/exercise-2.py (new file, 31 lines)
@@ -0,0 +1,31 @@
+import scrapy
+
+
+class QuotesScrollSpider(scrapy.Spider):
+    name = "quotes_scroll"
+    allowed_domains = ["quotes.toscrape.com"]
+    api_url = "https://quotes.toscrape.com/api/quotes?page={page}"
+
+    def start_requests(self):
+        yield scrapy.Request(self.api_url.format(page=1))
+
+    def parse(self, response):
+        data = response.json()
+
+        for quote in data.get("quotes"):
+            yield {
+                "quote": quote.get("text"),
+                "author": quote.get("author").get("name"),
+                "author_url": response.urljoin(
+                    quote.get("author").get("goodreads_link")
+                ),
+                "tags": quote.get("tags"),
+            }
+
+        current_page = data.get("page")
+
+        if data.get("has_next"):
+            next_page = current_page + 1
+            yield scrapy.Request(
+                self.api_url.format(page=next_page),
+            )
code/exercise-solutions/exercise-3.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+import json
+import scrapy
+
+
+class QuotesJSSpider(scrapy.Spider):
+    name = "quotes_js"
+    allowed_domains = ["quotes.toscrape.com"]
+    start_urls = ["https://quotes.toscrape.com/js/"]
+
+    def parse(self, response):
+        raw_quotes = response.xpath("//script").re_first(r"var data = ((?s:\[.*?\]));")
+
+        quotes = json.loads(raw_quotes)
+
+        for quote in quotes:
+            yield {
+                "quote": quote.get("text"),
+                "author": quote.get("author").get("name"),
+                "author_url": response.urljoin(
+                    quote.get("author").get("goodreads_link")
+                ),
+                "tags": quote.get("tags"),
+            }
+
+        yield scrapy.Request(
+            response.urljoin(response.css(".next a::attr(href)").get())
+        )
code/exercise-solutions/exercise-4.py (new file, 36 lines)
@@ -0,0 +1,36 @@
+import scrapy
+
+
+class QuotesSpider(scrapy.Spider):
+    name = "quotes_complete"
+    allowed_domains = ["quotes.toscrape.com"]
+    start_urls = ["https://quotes.toscrape.com"]
+
+    def parse(self, response):
+        quotes = response.css(".quote")
+        for quote in quotes:
+            about_url = response.urljoin(quote.css("span a::attr(href)").get())
+
+            quote_info = {
+                "quote": quote.css(".text::text").get(),
+                "author": quote.css(".author::text").get(),
+                "author_url": about_url,
+                "tags": quote.css(".tag *::text").getall(),
+            }
+
+            yield scrapy.Request(
+                about_url,
+                callback=self.parse_about_page,
+                meta={"quote_info": quote_info},
+                dont_filter=True,
+            )
+
+        yield scrapy.Request(
+            response.urljoin(response.css(".next a::attr(href)").get()),
+        )
+
+    def parse_about_page(self, response):
+        quote = response.meta["quote_info"]
+        author_born_date = response.css(".author-born-date::text").get()
+        quote["author_born_date"] = author_born_date
+        yield quote
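One design choice in exercise-4.py worth calling out: dont_filter=True is needed because several quotes on a page can share an author, so the same about-page URL is requested more than once; without it, Scrapy's duplicate-request filter would drop the repeated requests and those items would never be yielded.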