reorganizing slides

parent 63e275fa2f
commit 7c10d1c4b0

1 changed file with 138 additions and 104 deletions
@@ -180,7 +180,7 @@ from parsel import Selector
 
 *response = requests.get('https://us.pycon.org/2024/schedule/tutorials/')
 
-sel = Selector(text=response.body)
+sel = Selector(text=response.text)
 for tutorial in sel.css('.calendar a::text').getall():
     print(tutorial)
 ```
@@ -196,7 +196,7 @@ from parsel import Selector
 
 response = requests.get('https://us.pycon.org/2024/schedule/tutorials/')
 
-sel = Selector(text=response.body)
+sel = Selector(text=response.text)
 *for tutorial in sel.css('.calendar a::text').getall():
 *    print(tutorial)
 ```
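These two hunks fix a real bug: `requests.Response` has no `.body` attribute (Scrapy's `Response` does, which may be where the old line came from); with `requests` the decoded body is `.text` and the raw bytes are `.content`, so the old snippet raised `AttributeError`. The leading `*` inside these fenced blocks appears to be the slide framework's remark-style line-highlight marker, not Python. The corrected slide code as a standalone script, assuming `requests` and `parsel` are installed:

```python
import requests
from parsel import Selector

response = requests.get('https://us.pycon.org/2024/schedule/tutorials/')

# requests exposes the decoded body as .text (raw bytes live in .content);
# there is no response.body here, unlike Scrapy's Response objects.
sel = Selector(text=response.text)
for tutorial in sel.css('.calendar a::text').getall():
    print(tutorial)
```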
@@ -433,6 +433,34 @@ class: center, middle
 
 ---
 
+# CSS Selectors Examples
+
+```
+response.css("h1")
+```
+
+```
+response.css("ul#offers")
+```
+
+```
+response.css(".product")
+```
+
+```
+response.css("ul#offers .product a::attr(href)")
+```
+
+```
+response.css("ul#offers .product *::text")
+```
+
+```
+response.css("ul#offers .product p::text")
+```
+
+---
+
 # Parsing Data
 
 ```
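The selector examples added above can be tried offline with parsel; in Scrapy, `response.css(...)` runs these same parsel selectors. A sketch against an invented `ul#offers` fragment (the markup is hypothetical, not the real page):

```python
from parsel import Selector

# Invented markup shaped like the examples above, for illustration only.
html = """
<h1>Deals</h1>
<ul id="offers">
  <li class="product"><a href="/p/1">Widget</a><p>9.99</p></li>
  <li class="product"><a href="/p/2">Gadget</a><p>19.99</p></li>
</ul>
"""
sel = Selector(text=html)

print(sel.css("h1::text").get())                             # Deals
print(sel.css("ul#offers .product a::attr(href)").getall())  # ['/p/1', '/p/2']
print(sel.css("ul#offers .product p::text").getall())        # ['9.99', '19.99']
```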
@@ -487,6 +515,62 @@ class PyConUS2024Spider(scrapy.Spider):
 
 ---
 
+# XPath Examples
+
+```
+response.xpath("//h1")
+```
+
+```
+response.xpath("//h1[2]")
+```
+
+```
+response.xpath("//ul[@id='offers']")
+```
+
+```
+response.xpath("//li/a/@href")
+```
+
+```
+response.xpath("//li//text()")
+```
+
+```
+response.xpath("//li[@class='ad']/following-sibling::li")
+```
+
+---
+
+# Parsing Data
+
+```
+# code/pyconus2024-xpath.py
+import scrapy
+
+class PyConUS2024Spider(scrapy.Spider):
+    name = "pyconus"
+
+    start_urls = [
+        'https://us.pycon.org/2024/schedule/tutorials/',
+    ]
+
+    def parse(self, response):
+        for tutorial in response.xpath('//div[@class="presentation"]'):
+            yield {
+                'speaker': tutorial.xpath(
+                    './div[@class="speaker"]/text()'
+                ).get().strip(),
+                'url': response.urljoin(
+                    tutorial.xpath('.//a/@href').get()
+                ),
+                'title': tutorial.xpath('.//a/text()').get()
+            }
+```
+
+---
+
 # Parsing Data
 
 ```
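The same kind of invented fragment works for the new XPath slide. One subtlety worth flagging: `//h1[2]` matches an `h1` that is the second `h1` child of its parent, not the document's second `h1` overall (that would be `(//h1)[2]`). A sketch, with an "ad" item to exercise `following-sibling`:

```python
from parsel import Selector

# Hypothetical markup, invented to match the expressions above.
html = """
<h1>Deals</h1>
<ul id="offers">
  <li class="ad"><a href="/sponsored">Ad</a></li>
  <li class="product"><a href="/p/1">Widget</a></li>
</ul>
"""
sel = Selector(text=html)

print(sel.xpath("//h1/text()").get())      # Deals
print(sel.xpath("//li/a/@href").getall())  # ['/sponsored', '/p/1']
# Everything after the ad item:
print(sel.xpath("//li[@class='ad']/following-sibling::li//text()").getall())  # ['Widget']
```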
@@ -503,7 +587,9 @@ class PyConUS2024Spider(scrapy.Spider):
     def parse(self, response):
 *        for tutorial in response.xpath('//div[@class="presentation"]'):
             yield {
-*                'speaker': tutorial.xpath('./div[@class="speaker"]/text()').get().strip(),
+*                'speaker': tutorial.xpath(
+*                    './div[@class="speaker"]/text()'
+*                ).get().strip(),
                 'url': response.urljoin(
 *                    tutorial.xpath('.//a/@href').get()
                 ),
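The reflow above is purely cosmetic (line length), but the chain it wraps is fragile: `.get()` returns `None` when the speaker `div` is missing, and `.strip()` then raises `AttributeError`. A defensive variant using parsel's `default=`, sketched against an invented speaker-less block:

```python
from parsel import Selector

# Hypothetical presentation block with no speaker div.
sel = Selector(text='<div class="presentation"><a href="/t/1">Intro</a></div>')
tutorial = sel.css('div.presentation')[0]

path = './div[@class="speaker"]/text()'
print(tutorial.xpath(path).get())                          # None, so .strip() would raise
print(repr(tutorial.xpath(path).get(default='').strip()))  # ''
```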
@@ -543,62 +629,6 @@ class PyConUS2024Spider(scrapy.Spider):
 
 ---
 
-# CSS Selectors Examples
-
-```
-response.css("h1")
-```
-
-```
-response.css("ul#offers")
-```
-
-```
-response.css(".product")
-```
-
-```
-response.css("ul#offers .product a::attr(href)")
-```
-
-```
-response.css("ul#offers .product *::text")
-```
-
-```
-response.css("ul#offers .product p::text")
-```
-
----
-
-# XPath Examples
-
-```
-response.xpath("//h1")
-```
-
-```
-response.xpath("//h1[2]")
-```
-
-```
-response.xpath("//ul[@id='offers']")
-```
-
-```
-response.xpath("//li/a/@href")
-```
-
-```
-response.xpath("//li//text()")
-```
-
-```
-response.xpath("//li[@class='ad']/following-sibling::li")
-```
-
----
-
 # Exporting Results
 
 ```
@@ -661,7 +691,7 @@ Your task is to extract all of this information and export it into a JSON lines
 
 On this page, you will find a collection of quotes along with their respective authors.
 Each quote is accompanied by a link that directs you to a dedicated page providing
-additional details about the author, the quote itself, and a list of associated tags.
+additional details about the **author**, the **quote** itself, and a list of **associated tags**.
 
 Your task is to extract all of this information and export it into a JSON lines file.
 
@@ -922,11 +952,11 @@ class QuotesSpider(scrapy.Spider):
 
 **Target:** https://quotes.toscrape.com/scroll
 
-There has been another modification to the layout. Our quotes page now features an infinite
+Our quotes page now features an infinite
 scroll functionality, meaning that new content is dynamically loaded as you reach the bottom of the page.
 
 **TIP**: To understand this behavior, open your browser and access our target page. Press **F12** to
-open the developer tools and select the "_Network_" tab. Observe what occurs in the network requests
+open the **developer tools** and select the "_Network_" tab. Observe what occurs in the network requests
 when you navigate to the end of the page.
 
 ---
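What the Network tab reveals here, and what the spiders on the following slides consume, is a JSON endpoint. A quick way to confirm it outside the browser, using the same `page`/`has_next`/`quotes` keys the slide code relies on:

```python
import requests

# The URL the page fetches as you scroll, visible in the Network tab.
data = requests.get("https://quotes.toscrape.com/api/quotes?page=1").json()
print(data["page"], data["has_next"], len(data["quotes"]))
print(data["quotes"][0]["text"])
```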
@@ -963,7 +993,6 @@ class QuotesScrollSpider(scrapy.Spider):
 
     def parse(self, response):
         data = response.json()
-        current_page = data.get("page")
 
         for quote in data.get("quotes"):
             yield {
@@ -975,6 +1004,8 @@ class QuotesScrollSpider(scrapy.Spider):
                 "tags": quote.get("tags"),
             }
 
+        current_page = data.get("page")
+
         if data.get("has_next"):
             next_page = current_page + 1
             yield scrapy.Request(
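This pair of hunks, repeated below once per incremental slide, simply moves `current_page = data.get("page")` down next to the `has_next` check that actually uses it. For comparison, the same pagination logic as a plain loop outside Scrapy (a sketch, same JSON keys as the slides):

```python
import requests

api_url = "https://quotes.toscrape.com/api/quotes?page={page}"
page = 1
while True:
    data = requests.get(api_url.format(page=page)).json()
    for quote in data.get("quotes"):
        print(quote.get("text"))
    if not data.get("has_next"):
        break
    page = data.get("page") + 1  # mirrors next_page = current_page + 1
```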
@@ -999,7 +1030,6 @@ class QuotesScrollSpider(scrapy.Spider):
 
     def parse(self, response):
         data = response.json()
-        current_page = data.get("page")
 
         for quote in data.get("quotes"):
             yield {
@@ -1011,6 +1041,8 @@ class QuotesScrollSpider(scrapy.Spider):
                 "tags": quote.get("tags"),
             }
 
+        current_page = data.get("page")
+
         if data.get("has_next"):
             next_page = current_page + 1
             yield scrapy.Request(
@@ -1035,7 +1067,6 @@ class QuotesScrollSpider(scrapy.Spider):
 
     def parse(self, response):
 *        data = response.json()
-        current_page = data.get("page")
 
         for quote in data.get("quotes"):
             yield {
@@ -1047,6 +1078,8 @@ class QuotesScrollSpider(scrapy.Spider):
                 "tags": quote.get("tags"),
             }
 
+        current_page = data.get("page")
+
         if data.get("has_next"):
             next_page = current_page + 1
             yield scrapy.Request(
@@ -1071,43 +1104,6 @@ class QuotesScrollSpider(scrapy.Spider):
 
     def parse(self, response):
         data = response.json()
-*        current_page = data.get("page")
-
-        for quote in data.get("quotes"):
-            yield {
-                "quote": quote.get("text"),
-                "author": quote.get("author").get("name"),
-                "author_url": response.urljoin(
-                    quote.get("author").get("goodreads_link")
-                ),
-                "tags": quote.get("tags"),
-            }
-
-*        if data.get("has_next"):
-*            next_page = current_page + 1
-*            yield scrapy.Request(
-*                self.api_url.format(page=next_page),
-*            )
-```
-
----
-
-```python
-# code/exercise-2.py
-import scrapy
-
-
-class QuotesScrollSpider(scrapy.Spider):
-    name = "quotes_scroll"
-    allowed_domains = ["quotes.toscrape.com"]
-    api_url = "https://quotes.toscrape.com/api/quotes?page={page}"
-
-    def start_requests(self):
-        yield scrapy.Request(self.api_url.format(page=1))
-
-    def parse(self, response):
-        data = response.json()
-        current_page = data.get("page")
 
 *        for quote in data.get("quotes"):
 *            yield {
@@ -1119,6 +1115,8 @@ class QuotesScrollSpider(scrapy.Spider):
 *                "tags": quote.get("tags"),
 *            }
 
+        current_page = data.get("page")
+
         if data.get("has_next"):
             next_page = current_page + 1
             yield scrapy.Request(
@@ -1128,12 +1126,48 @@ class QuotesScrollSpider(scrapy.Spider):
 
 ---
 
+```python
+# code/exercise-2.py
+import scrapy
+
+
+class QuotesScrollSpider(scrapy.Spider):
+    name = "quotes_scroll"
+    allowed_domains = ["quotes.toscrape.com"]
+    api_url = "https://quotes.toscrape.com/api/quotes?page={page}"
+
+    def start_requests(self):
+        yield scrapy.Request(self.api_url.format(page=1))
+
+    def parse(self, response):
+        data = response.json()
+
+        for quote in data.get("quotes"):
+            yield {
+                "quote": quote.get("text"),
+                "author": quote.get("author").get("name"),
+                "author_url": response.urljoin(
+                    quote.get("author").get("goodreads_link")
+                ),
+                "tags": quote.get("tags"),
+            }
+
+*        current_page = data.get("page")
+*
+*        if data.get("has_next"):
+*            next_page = current_page + 1
+*            yield scrapy.Request(
+*                self.api_url.format(page=next_page),
+*            )
+```
+
+---
+
 # Exercise 3
 
 **Target:** https://quotes.toscrape.com/js/
 
-The spider you created in the first exercise has ceased to function. Although no errors
-are evident in the logs, the spider is not returning any data.
+The spider you created in the first exercise has ceased to function. Although no errors are evident in the logs, the spider is not returning any data.
 
 **TIP**: To troubleshoot, open your browser and navigate to our target page.
 Press **Ctrl+U** (_View Page Source_) to inspect the HTML content of the page.
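The Ctrl+U tip can also be automated: fetch the page without a browser and check whether the quote markup appears in the raw HTML. A sketch, assuming the quote blocks use the `div.quote` markup of the non-JS site; on `/js/` the count comes back zero because the quotes are built by JavaScript after load:

```python
import requests
from parsel import Selector

for url in ("https://quotes.toscrape.com/", "https://quotes.toscrape.com/js/"):
    sel = Selector(text=requests.get(url).text)
    print(url, "->", len(sel.css("div.quote")), "quote blocks in the raw HTML")
```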