Final version

Renne Rocha 2024-05-07 16:17:43 -03:00
parent d7fd3dd578
commit 63e275fa2f
30 changed files with 2918 additions and 5 deletions

code/exercise-1.py (new file, 21 lines added)

@@ -0,0 +1,21 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ["https://quotes.toscrape.com"]

    def parse(self, response):
        quotes = response.css(".quote")
        for quote in quotes:
            yield {
                "quote": quote.css(".text::text").get(),
                "author": quote.css(".author::text").get(),
                "author_url": response.urljoin(quote.css("span a::attr(href)").get()),
                "tags": quote.css(".tag *::text").getall(),
            }

        # Follow pagination only while a "Next" link exists; on the last
        # page .get() returns None and urljoin(None) would raise an error.
        next_href = response.css(".next a::attr(href)").get()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href))
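
To try the spider without creating a full Scrapy project, scrapy runspider code/exercise-1.py -O quotes.json does the job. Running it programmatically also works; a minimal sketch (not part of the commit), assuming the file is importable under a module name such as exercise_1:

from scrapy.crawler import CrawlerProcess

from exercise_1 import QuotesSpider  # hypothetical module name

process = CrawlerProcess(settings={"FEEDS": {"quotes.json": {"format": "json"}}})
process.crawl(QuotesSpider)
process.start()  # blocks until the crawl finishes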

code/exercise-2.py (new file, 28 lines added)

@@ -0,0 +1,28 @@
import scrapy


class QuotesScrollSpider(scrapy.Spider):
    name = "quotes_scroll"
    allowed_domains = ["quotes.toscrape.com"]
    api_url = "https://quotes.toscrape.com/api/quotes?page={page}"

    def start_requests(self):
        yield scrapy.Request(self.api_url.format(page=1))

    def parse(self, response):
        data = response.json()
        current_page = data.get("page")
        for quote in data.get("quotes"):
            yield {
                "quote": quote.get("text"),
                "author": quote.get("author").get("name"),
                "author_url": response.urljoin(
                    quote.get("author").get("goodreads_link")
                ),
                "tags": quote.get("tags"),
            }

        if data.get("has_next"):
            next_page = current_page + 1
            yield scrapy.Request(
                self.api_url.format(page=next_page),
            )
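
This spider never touches the site's HTML; it pages through the JSON API directly. For reference, the payload shape it relies on, reconstructed from the keys the code reads (values are illustrative placeholders, not real API output):

# One page of the quotes API, as assumed by the spider above.
sample_page = {
    "page": 1,
    "has_next": True,
    "quotes": [
        {
            "text": "An example quote.",
            "tags": ["example"],
            "author": {
                "name": "An Author",
                "goodreads_link": "/author/show/0000000",
            },
        }
    ],
}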

code/exercise-3.py (new file, 24 lines added)

@@ -0,0 +1,24 @@
import json

import scrapy


class QuotesJSSpider(scrapy.Spider):
    name = "quotes_js"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ["https://quotes.toscrape.com/js/"]

    def parse(self, response):
        # The quotes are embedded in an inline <script> tag as a JavaScript
        # array literal: extract it with a regex and parse it as JSON.
        raw_quotes = response.xpath("//script").re_first(r"var data = ((?s:\[.*?\]));")
        quotes = json.loads(raw_quotes)
        for quote in quotes:
            yield {
                "quote": quote.get("text"),
                "author": quote.get("author").get("name"),
                "author_url": response.urljoin(
                    quote.get("author").get("goodreads_link")
                ),
                "tags": quote.get("tags"),
            }

        # Follow pagination only while a "Next" link exists.
        next_href = response.css(".next a::attr(href)").get()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href))
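
The crux here is the re_first call: the quotes on the /js/ page live in an inline <script> tag as a JavaScript array literal, which happens to be valid JSON. A standalone sketch of the same extraction, against an invented sample script:

import json
import re

# Hypothetical, much-simplified version of the inline script on /js/.
html = '<script>var data = [{"text": "A quote"}];</script>'

# (?s:...) lets "." match newlines inside the array; the lazy .*? keeps
# growing until the closing bracket is immediately followed by ";".
raw = re.search(r"var data = ((?s:\[.*?\]));", html).group(1)
print(json.loads(raw))  # [{'text': 'A quote'}]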

code/exercise-4.py (new file, 36 lines added)

@@ -0,0 +1,36 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes_complete"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ["https://quotes.toscrape.com"]

    def parse(self, response):
        quotes = response.css(".quote")
        for quote in quotes:
            about_url = response.urljoin(quote.css("span a::attr(href)").get())
            quote_info = {
                "quote": quote.css(".text::text").get(),
                "author": quote.css(".author::text").get(),
                "author_url": about_url,
                "tags": quote.css(".tag *::text").getall(),
            }
            # dont_filter=True: several quotes share an author, so the same
            # about page must be fetched more than once; the partial item
            # travels to the callback in the request meta.
            yield scrapy.Request(
                about_url,
                callback=self.parse_about_page,
                meta={"quote_info": quote_info},
                dont_filter=True,
            )

        # Follow pagination only while a "Next" link exists.
        next_href = response.css(".next a::attr(href)").get()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href))

    def parse_about_page(self, response):
        quote = response.meta["quote_info"]
        author_born_date = response.css(".author-born-date::text").get()
        quote["author_born_date"] = author_born_date
        yield quote
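
Passing the partial item through response.meta works, but Scrapy (1.7+) also offers cb_kwargs, which delivers the value to the callback as a regular argument. A sketch of the same hand-off with that approach (a variant, not the commit's code):

import scrapy


class QuotesCbKwargsSpider(scrapy.Spider):
    # Hypothetical variant of exercise-4 using cb_kwargs instead of meta.
    name = "quotes_cb_kwargs"
    start_urls = ["https://quotes.toscrape.com"]

    def parse(self, response):
        for quote in response.css(".quote"):
            about_url = response.urljoin(quote.css("span a::attr(href)").get())
            quote_info = {
                "quote": quote.css(".text::text").get(),
                "author": quote.css(".author::text").get(),
            }
            yield scrapy.Request(
                about_url,
                callback=self.parse_about_page,
                cb_kwargs={"quote_info": quote_info},
                dont_filter=True,
            )

    def parse_about_page(self, response, quote_info):
        quote_info["author_born_date"] = response.css(".author-born-date::text").get()
        yield quote_info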


@@ -0,0 +1,27 @@
import scrapy


class QuotesPlaywrightSpider(scrapy.Spider):
    name = "quotes-playwright"
    custom_settings = {
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "PLAYWRIGHT_LAUNCH_OPTIONS": {
            "headless": True,
        },
    }

    def start_requests(self):
        yield scrapy.Request(
            url="http://quotes.toscrape.com/js/",
            meta={
                "playwright": False,
            },
        )

    async def parse(self, response):
        # With "playwright" disabled, the raw, non-rendered HTML is saved.
        with open("playwright-disabled.html", "w") as content:
            content.write(response.text)


@@ -0,0 +1,27 @@
import scrapy


class QuotesPlaywrightSpider(scrapy.Spider):
    name = "quotes-playwright"
    custom_settings = {
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "PLAYWRIGHT_LAUNCH_OPTIONS": {
            "headless": True,
        },
    }

    def start_requests(self):
        yield scrapy.Request(
            url="http://quotes.toscrape.com/js/",
            meta={
                "playwright": True,
            },
        )

    async def parse(self, response):
        # With "playwright" enabled, the JavaScript-rendered DOM is saved.
        with open("playwright-enabled.html", "w") as content:
            content.write(response.text)
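
The two spiders above are identical except for the "playwright" flag in meta and the file they write. Both require the scrapy-playwright package (pip install scrapy-playwright, then playwright install chromium to fetch a browser). After running both, a quick check along these lines should find the quote markup only in the rendered file, since the quotes on /js/ are injected by JavaScript; a sketch, not part of the commit:

# Compare the saved pages: only the Playwright-rendered one should
# contain the .quote elements.
for path in ("playwright-disabled.html", "playwright-enabled.html"):
    with open(path) as f:
        html = f.read()
    print(path, "->", 'class="quote"' in html)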

code/playwright-quotes.py (new file, 40 lines added)

@@ -0,0 +1,40 @@
import scrapy


class QuotesPlaywrightSpider(scrapy.Spider):
    name = "quotes-playwright"
    custom_settings = {
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "PLAYWRIGHT_LAUNCH_OPTIONS": {
            "headless": True,
        },
    }

    def start_requests(self):
        yield scrapy.Request(
            url="http://quotes.toscrape.com/js/",
            meta={
                "playwright": True,
            },
        )

    async def parse(self, response):
        quotes = response.css(".quote")
        for quote in quotes:
            yield {
                "quote": quote.css(".text::text").get(),
                "author": quote.css(".author::text").get(),
                "author_url": response.urljoin(quote.css("span a::attr(href)").get()),
                "tags": quote.css(".tag *::text").getall(),
            }

        # Follow pagination only while a "Next" link exists; keep rendering
        # every page with Playwright.
        next_href = response.css(".next a::attr(href)").get()
        if next_href:
            yield scrapy.Request(
                response.urljoin(next_href),
                meta={
                    "playwright": True,
                },
            )
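
On slower pages it can help to have Playwright wait for the content before Scrapy parses the response. scrapy-playwright supports this through page methods; a sketch using its PageMethod helper to wait for the first quote (custom_settings as in the spider above, omitted for brevity):

import scrapy
from scrapy_playwright.page import PageMethod


class QuotesWaitSpider(scrapy.Spider):
    # Hypothetical variant: block until at least one quote has rendered.
    name = "quotes-playwright-wait"

    def start_requests(self):
        yield scrapy.Request(
            "http://quotes.toscrape.com/js/",
            meta={
                "playwright": True,
                "playwright_page_methods": [
                    PageMethod("wait_for_selector", ".quote"),
                ],
            },
        )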

code/pyconus2024-css.py (new file, 17 lines added)

@@ -0,0 +1,17 @@
import scrapy


class PyConUS2024Spider(scrapy.Spider):
    name = "pyconus"
    start_urls = [
        "https://us.pycon.org/2024/schedule/tutorials/",
    ]

    def parse(self, response):
        for tutorial in response.css(".presentation"):
            yield {
                "speaker": tutorial.css(".speaker::text").get().strip(),
                "url": response.urljoin(tutorial.css(".title a::attr(href)").get()),
                "title": tutorial.css(".title a::text").get(),
            }
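
The CSS selectors are the heart of this exercise, and they can be iterated on without re-crawling by testing them against a saved copy of the page with parsel (already used elsewhere in this commit). A sketch; tutorials.html is a hypothetical local file name:

from parsel import Selector

# tutorials.html: a saved copy of the schedule page (hypothetical name).
sel = Selector(text=open("tutorials.html", encoding="utf-8").read())
print(len(sel.css(".presentation")), "tutorials found")
print(sel.css(".presentation .title a::text").get())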


@@ -1,8 +1,8 @@
 import requests
 from parsel import Selector

-response = requests.get('https://us.pycon.org/2024/schedule/tutorials/')
+response = requests.get("https://us.pycon.org/2024/schedule/tutorials/")
 sel = Selector(text=response.text)

-for tutorial in sel.css('.calendar a::text').getall():
-    print(tutorial)
+for tutorial in sel.css(".calendar a::text").getall():
+    print(tutorial)

code/pyconus2024-xpath.py (new file, 19 lines added)

@@ -0,0 +1,19 @@
import scrapy


class PyConUS2024Spider(scrapy.Spider):
    name = "pyconus"
    start_urls = [
        "https://us.pycon.org/2024/schedule/tutorials/",
    ]

    def parse(self, response):
        for tutorial in response.xpath('//div[@class="presentation"]'):
            yield {
                "speaker": tutorial.xpath('./div[@class="speaker"]/text()')
                .get()
                .strip(),
                "url": response.urljoin(tutorial.xpath(".//a/@href").get()),
                "title": tutorial.xpath(".//a/text()").get(),
            }


@@ -1,12 +1,13 @@
 import scrapy


 class PyConUS2024Spider(scrapy.Spider):
     name = "pyconus"
     start_urls = [
-        'https://us.pycon.org/2024/schedule/tutorials/',
+        "https://us.pycon.org/2024/schedule/tutorials/",
     ]

     def parse(self, response):
-        for tutorial in response.css('.calendar a::text').getall():
+        for tutorial in response.css(".calendar a::text").getall():
             yield {"title": tutorial}