Reorganize example code
This commit is contained in:
parent
7c10d1c4b0
commit
6580c266dd
8 changed files with 143 additions and 56 deletions
|
@@ -8,17 +8,9 @@ class QuotesJSSpider(scrapy.Spider):
|
|||
# Entry page for the crawl: the "/js/" variant of quotes.toscrape.com.
start_urls = ["https://quotes.toscrape.com/js/"]
|
||||
|
||||
def parse(self, response):
    """Yield one item per quote embedded in the page's script, then paginate.

    The quotes are read from a ``var data = [...];`` JavaScript assignment
    found in one of the page's ``<script>`` elements and decoded as JSON.

    Args:
        response: The downloaded page; must provide ``xpath``, ``css`` and
            ``urljoin``.

    Yields:
        A dict with ``quote``, ``author``, ``author_url`` and ``tags`` keys
        for every quote, followed by a ``scrapy.Request`` for the next page
        when the page has a ``.next a`` link.
    """
    # 1. Find the raw data inside the HTML
    raw_quotes = response.xpath("//script").re_first(r"var data = ((?s:\[.*?\]));")

    # 2. With the raw data, convert it to Python and parse it
    # re_first() returns None when no script matches, and json.loads(None)
    # would raise TypeError, so treat a missing payload as "no quotes".
    quotes = json.loads(raw_quotes) if raw_quotes else []
    for quote in quotes:
        # A quote without an author object must not break the whole page.
        author = quote.get("author") or {}
        goodreads_link = author.get("goodreads_link")
        yield {
            "quote": quote.get("text"),
            "author": author.get("name"),
            # urljoin(None) would resolve to the page's own URL, which is
            # not an author link; report the missing link as None instead.
            "author_url": (
                response.urljoin(goodreads_link) if goodreads_link else None
            ),
            "tags": quote.get("tags"),
        }

    # 3. Don't forget we have pagination here too
    # The last page has no ".next" link; only request a page that exists
    # instead of re-requesting the current URL.
    next_page = response.css(".next a::attr(href)").get()
    if next_page:
        yield scrapy.Request(response.urljoin(next_page))
|
Loading…
Add table
Add a link
Reference in a new issue