reorganizing slides

Renne Rocha 2024-05-12 10:07:36 -03:00
parent 63e275fa2f
commit 7c10d1c4b0


@@ -180,7 +180,7 @@ from parsel import Selector
 *response = requests.get('https://us.pycon.org/2024/schedule/tutorials/')
-sel = Selector(text=response.body)
+sel = Selector(text=response.text)
 for tutorial in sel.css('.calendar a::text').getall():
     print(tutorial)
 ```
@@ -196,7 +196,7 @@ from parsel import Selector
 response = requests.get('https://us.pycon.org/2024/schedule/tutorials/')
-sel = Selector(text=response.body)
+sel = Selector(text=response.text)
 *for tutorial in sel.css('.calendar a::text').getall():
 *    print(tutorial)
 ```
@@ -433,6 +433,34 @@ class: center, middle
 ---
+# CSS Selectors Examples
+```
+response.css("h1")
+```
+```
+response.css("ul#offers")
+```
+```
+response.css(".product")
+```
+```
+response.css("ul#offers .product a::attr(href)")
+```
+```
+response.css("ul#offers .product *::text")
+```
+```
+response.css("ul#offers .product p::text")
+```
+---
 # Parsing Data
 ```
@@ -487,6 +515,62 @@ class PyConUS2024Spider(scrapy.Spider):
 ---
+# XPath Examples
+```
+response.xpath("//h1")
+```
+```
+response.xpath("//h1[2]")
+```
+```
+response.xpath("//ul[@id='offers']")
+```
+```
+response.xpath("//li/a/@href")
+```
+```
+response.xpath("//li//text()")
+```
+```
+response.xpath("//li[@class='ad']/following-sibling::li")
+```
+---
+# Parsing Data
+```
+# code/pyconus2024-xpath.py
+import scrapy
+class PyConUS2024Spider(scrapy.Spider):
+    name = "pyconus"
+    start_urls = [
+        'https://us.pycon.org/2024/schedule/tutorials/',
+    ]
+    def parse(self, response):
+        for tutorial in response.xpath('//div[@class="presentation"]'):
+            yield {
+                'speaker': tutorial.xpath(
+                    './div[@class="speaker"]/text()'
+                ).get().strip(),
+                'url': response.urljoin(
+                    tutorial.xpath('.//a/@href').get()
+                ),
+                'title': tutorial.xpath('.//a/text()').get()
+            }
+```
+---
 # Parsing Data
 ```
@@ -503,7 +587,9 @@ class PyConUS2024Spider(scrapy.Spider):
     def parse(self, response):
 *        for tutorial in response.xpath('//div[@class="presentation"]'):
             yield {
-*                'speaker': tutorial.xpath('./div[@class="speaker"]/text()').get().strip(),
+*                'speaker': tutorial.xpath(
+*                    './div[@class="speaker"]/text()'
+*                ).get().strip(),
                 'url': response.urljoin(
 *                    tutorial.xpath('.//a/@href').get()
                 ),
@@ -543,62 +629,6 @@ class PyConUS2024Spider(scrapy.Spider):
 ---
-# CSS Selectors Examples
-```
-response.css("h1")
-```
-```
-response.css("ul#offers")
-```
-```
-response.css(".product")
-```
-```
-response.css("ul#offers .product a::attr(href)")
-```
-```
-response.css("ul#offers .product *::text")
-```
-```
-response.css("ul#offers .product p::text")
-```
----
-# XPath Examples
-```
-response.xpath("//h1")
-```
-```
-response.xpath("//h1[2]")
-```
-```
-response.xpath("//ul[@id='offers']")
-```
-```
-response.xpath("//li/a/@href")
-```
-```
-response.xpath("//li//text()")
-```
-```
-response.xpath("//li[@class='ad']/following-sibling::li")
-```
----
 # Exporting Results
 ```
@@ -661,7 +691,7 @@ Your task is to extract all of this information and export it into a JSON lines
 On this page, you will find a collection of quotes along with their respective authors.
 Each quote is accompanied by a link that directs you to a dedicated page providing
-additional details about the author, the quote itself, and a list of associated tags.
+additional details about the **author**, the **quote** itself, and a list of **associated tags**.
 Your task is to extract all of this information and export it into a JSON lines file.
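As an illustration of what this exercise asks for, here is a minimal sketch (not part of this commit). It mirrors the output fields of the exercise-2 solution shown later in this diff; the spider name and the CSS selectors are assumptions about the usual quotes.toscrape.com markup.

```python
# Hypothetical sketch for Exercise 1 (not part of this commit).
# Assumes the usual quotes.toscrape.com markup: each quote sits in a
# div.quote with span.text, small.author, an "(about)" link, and tag links.
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        for quote in response.css("div.quote"):
            yield {
                "quote": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
                "author_url": response.urljoin(
                    quote.css("a::attr(href)").get()
                ),
                "tags": quote.css("div.tags a.tag::text").getall(),
            }
        # Follow pagination if a "Next" link is present.
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
```

Exporting to JSON lines would then be a matter of running something like `scrapy crawl quotes -o quotes.jl`.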
@@ -922,11 +952,11 @@ class QuotesSpider(scrapy.Spider):
 **Target:** https://quotes.toscrape.com/scroll
-There has been another modification to the layout. Our quotes page now features an infinite
+Our quotes page now features infinite
 scroll functionality, meaning that new content is dynamically loaded as you reach the bottom of the page.
 **TIP**: To understand this behavior, open your browser and access our target page. Press **F12** to
-open the developer tools and select the "_Network_" tab. Observe what occurs in the network requests
+open the **developer tools** and select the "_Network_" tab. Observe what occurs in the network requests
 when you navigate to the end of the page.
 ---
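As a quick illustration of the TIP above, the request visible in the Network tab can be reproduced outside the browser. The endpoint and the field names below are taken from the exercise-2 solution shown later in this diff; treat the snippet as an illustrative sketch, not part of the slides.

```python
# Hypothetical check of the infinite-scroll API (not part of this commit).
# Endpoint and field names come from the exercise-2 spider later in this diff.
import requests

response = requests.get("https://quotes.toscrape.com/api/quotes?page=1")
data = response.json()

print(data["page"])        # current page number
print(data["has_next"])    # True while more pages remain
for quote in data["quotes"]:
    print(quote["text"], "-", quote["author"]["name"])
```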
@@ -963,7 +993,6 @@ class QuotesScrollSpider(scrapy.Spider):
     def parse(self, response):
         data = response.json()
-        current_page = data.get("page")
         for quote in data.get("quotes"):
             yield {
@@ -975,6 +1004,8 @@ class QuotesScrollSpider(scrapy.Spider):
                 "tags": quote.get("tags"),
             }
+        current_page = data.get("page")
         if data.get("has_next"):
             next_page = current_page + 1
             yield scrapy.Request(
@@ -999,7 +1030,6 @@ class QuotesScrollSpider(scrapy.Spider):
     def parse(self, response):
         data = response.json()
-        current_page = data.get("page")
         for quote in data.get("quotes"):
             yield {
@@ -1011,6 +1041,8 @@ class QuotesScrollSpider(scrapy.Spider):
                 "tags": quote.get("tags"),
             }
+        current_page = data.get("page")
         if data.get("has_next"):
             next_page = current_page + 1
             yield scrapy.Request(
@@ -1035,7 +1067,6 @@ class QuotesScrollSpider(scrapy.Spider):
     def parse(self, response):
 *        data = response.json()
-        current_page = data.get("page")
         for quote in data.get("quotes"):
             yield {
@@ -1047,6 +1078,8 @@ class QuotesScrollSpider(scrapy.Spider):
                 "tags": quote.get("tags"),
             }
+        current_page = data.get("page")
         if data.get("has_next"):
             next_page = current_page + 1
             yield scrapy.Request(
@@ -1071,43 +1104,6 @@ class QuotesScrollSpider(scrapy.Spider):
     def parse(self, response):
         data = response.json()
-*        current_page = data.get("page")
-        for quote in data.get("quotes"):
-            yield {
-                "quote": quote.get("text"),
-                "author": quote.get("author").get("name"),
-                "author_url": response.urljoin(
-                    quote.get("author").get("goodreads_link")
-                ),
-                "tags": quote.get("tags"),
-            }
-*        if data.get("has_next"):
-*            next_page = current_page + 1
-*            yield scrapy.Request(
-*                self.api_url.format(page=next_page),
-*            )
-```
----
-```python
-# code/exercise-2.py
-import scrapy
-class QuotesScrollSpider(scrapy.Spider):
-    name = "quotes_scroll"
-    allowed_domains = ["quotes.toscrape.com"]
-    api_url = "https://quotes.toscrape.com/api/quotes?page={page}"
-    def start_requests(self):
-        yield scrapy.Request(self.api_url.format(page=1))
-    def parse(self, response):
-        data = response.json()
-        current_page = data.get("page")
 *        for quote in data.get("quotes"):
 *            yield {
@@ -1119,6 +1115,8 @@ class QuotesScrollSpider(scrapy.Spider):
 *                "tags": quote.get("tags"),
 *            }
+        current_page = data.get("page")
         if data.get("has_next"):
             next_page = current_page + 1
             yield scrapy.Request(
@@ -1128,12 +1126,48 @@ class QuotesScrollSpider(scrapy.Spider):
 ---
+```python
+# code/exercise-2.py
+import scrapy
+class QuotesScrollSpider(scrapy.Spider):
+    name = "quotes_scroll"
+    allowed_domains = ["quotes.toscrape.com"]
+    api_url = "https://quotes.toscrape.com/api/quotes?page={page}"
+    def start_requests(self):
+        yield scrapy.Request(self.api_url.format(page=1))
+    def parse(self, response):
+        data = response.json()
+        for quote in data.get("quotes"):
+            yield {
+                "quote": quote.get("text"),
+                "author": quote.get("author").get("name"),
+                "author_url": response.urljoin(
+                    quote.get("author").get("goodreads_link")
+                ),
+                "tags": quote.get("tags"),
+            }
+*        current_page = data.get("page")
+*
+*        if data.get("has_next"):
+*            next_page = current_page + 1
+*            yield scrapy.Request(
+*                self.api_url.format(page=next_page),
+*            )
+```
+---
 # Exercise 3
 **Target:** https://quotes.toscrape.com/js/
-The spider you created in the first exercise has ceased to function. Although no errors
-are evident in the logs, the spider is not returning any data.
+The spider you created in the first exercise has ceased to function. Although no errors are evident in the logs, the spider is not returning any data.
 **TIP**: To troubleshoot, open your browser and navigate to our target page.
 Press **Ctrl+U** (_View Page Source_) to inspect the HTML content of the page.
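One possible direction for this exercise (not part of this commit): on JavaScript-rendered pages like this one, View Page Source often shows the quote data embedded as a JavaScript array inside a `<script>` tag rather than as regular HTML elements. Assuming that is what the page source reveals here, a spider could extract that array and parse it as JSON; the XPath expression, the regular expression, and the field names below are illustrative assumptions.

```python
# Hypothetical sketch for Exercise 3 (not part of this commit).
# Assumes the quotes are embedded as a JavaScript array ("var data = [...]")
# inside a <script> tag, which is what View Page Source typically reveals.
import json
import re

import scrapy


class QuotesJSSpider(scrapy.Spider):
    name = "quotes_js"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ["https://quotes.toscrape.com/js/"]

    def parse(self, response):
        # Grab the script that defines the data array and pull out the JSON part.
        script = response.xpath(
            '//script[contains(text(), "var data")]/text()'
        ).get()
        if not script:
            return
        match = re.search(r"var data = (\[.*?\]);", script, re.DOTALL)
        if not match:
            return
        for quote in json.loads(match.group(1)):
            yield {
                "quote": quote.get("text"),
                "author": quote.get("author", {}).get("name"),
                "tags": quote.get("tags"),
            }
```

If the embedded array turned out not to be strict JSON, a more permissive JavaScript object parser would be the natural fallback.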