diff --git a/presentation/presentation.html b/presentation/presentation.html
index 2710895..59626f5 100644
--- a/presentation/presentation.html
+++ b/presentation/presentation.html
@@ -180,7 +180,7 @@ from parsel import Selector
*response = requests.get('https://us.pycon.org/2024/schedule/tutorials/')
-sel = Selector(text=response.body)
+sel = Selector(text=response.text)
for tutorial in sel.css('.calendar a::text').getall():
print(tutorial)
```
@@ -196,7 +196,7 @@ from parsel import Selector
response = requests.get('https://us.pycon.org/2024/schedule/tutorials/')
-sel = Selector(text=response.body)
+sel = Selector(text=response.text)
*for tutorial in sel.css('.calendar a::text').getall():
* print(tutorial)
```
@@ -433,6 +433,34 @@ class: center, middle
---
+# CSS Selector Examples
+
+```
+response.css("h1")
+```
+
+```
+response.css("ul#offers")
+```
+
+```
+response.css(".product")
+```
+
+```
+response.css("ul#offers .product a::attr(href)")
+```
+
+```
+response.css("ul#offers .product *::text")
+```
+
+```
+response.css("ul#offers .product p::text")
+```
+
+---
+
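+# CSS Selectors: Worked Example
+
+A minimal sketch of how these selectors behave, using `parsel` directly
+(the selector library behind Scrapy's `response.css`). The HTML snippet is
+assumed purely for illustration:
+
+```python
+from parsel import Selector
+
+html = """
+<ul id="offers">
+  <li class="product">
+    <a href="/offers/1">Offer 1</a>
+    <p>Half price</p>
+  </li>
+</ul>
+"""
+sel = Selector(text=html)
+
+sel.css("ul#offers .product a::attr(href)").getall()  # ['/offers/1']
+sel.css("ul#offers .product p::text").get()           # 'Half price'
+```
+
+---
+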
# Parsing Data
```
@@ -487,6 +515,62 @@ class PyConUS2024Spider(scrapy.Spider):
---
+# XPath Examples
+
+```
+response.xpath("//h1")
+```
+
+```
+response.xpath("//h1[2]")
+```
+
+```
+response.xpath("//ul[@id='offers']")
+```
+
+```
+response.xpath("//li/a/@href")
+```
+
+```
+response.xpath("//li//text()")
+```
+
+```
+response.xpath("//li[@class='ad']/following-sibling::li")
+```
+
+---
+
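+# XPath: Worked Example
+
+The same idea with XPath, again via `parsel`; the markup is assumed for
+illustration:
+
+```python
+from parsel import Selector
+
+html = """
+<ul id="offers">
+  <li class="ad"><a href="/sponsored">Ad</a></li>
+  <li><a href="/offers/1">Offer 1</a></li>
+</ul>
+"""
+sel = Selector(text=html)
+
+sel.xpath("//li/a/@href").getall()
+# ['/sponsored', '/offers/1']
+
+sel.xpath("//li[@class='ad']/following-sibling::li//text()").getall()
+# ['Offer 1']
+```
+
+---
+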
+# Parsing Data
+
+```
+# code/pyconus2024-xpath.py
+import scrapy
+
+class PyConUS2024Spider(scrapy.Spider):
+ name = "pyconus"
+
+ start_urls = [
+ 'https://us.pycon.org/2024/schedule/tutorials/',
+ ]
+
+ def parse(self, response):
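+        # each tutorial is a <div class="presentation">; the './...' and
+        # './/...' queries below stay relative to that node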
+ for tutorial in response.xpath('//div[@class="presentation"]'):
+ yield {
+ 'speaker': tutorial.xpath(
+ './div[@class="speaker"]/text()'
+ ).get().strip(),
+ 'url': response.urljoin(
+ tutorial.xpath('.//a/@href').get()
+ ),
+ 'title': tutorial.xpath('.//a/text()').get()
+ }
+```
+
+---
+
# Parsing Data
```
@@ -503,7 +587,9 @@ class PyConUS2024Spider(scrapy.Spider):
def parse(self, response):
* for tutorial in response.xpath('//div[@class="presentation"]'):
yield {
-* 'speaker': tutorial.xpath('./div[@class="speaker"]/text()').get().strip(),
+* 'speaker': tutorial.xpath(
+* './div[@class="speaker"]/text()'
+* ).get().strip(),
'url': response.urljoin(
* tutorial.xpath('.//a/@href').get()
),
@@ -543,62 +629,6 @@ class PyConUS2024Spider(scrapy.Spider):
---
-# CSS Selectors Examples
-
-```
-response.css("h1")
-```
-
-```
-response.css("ul#offers")
-```
-
-```
-response.css(".product")
-```
-
-```
-response.css("ul#offers .product a::attr(href)")
-```
-
-```
-response.css("ul#offers .product *::text")
-```
-
-```
-response.css("ul#offers .product p::text")
-```
-
----
-
-# XPath Examples
-
-```
-response.xpath("//h1")
-```
-
-```
-response.xpath("//h1[2]")
-```
-
-```
-response.xpath("//ul[@id='offers']")
-```
-
-```
-response.xpath("//li/a/@href")
-```
-
-```
-response.xpath("//li//text()")
-```
-
-```
-response.xpath("//li[@class='ad']/following-sibling::li")
-```
-
----
-
# Exporting Results
```
@@ -661,7 +691,7 @@ Your task is to extract all of this information and export it into a JSON lines
On this page, you will find a collection of quotes along with their respective authors.
Each quote is accompanied by a link that directs you to a dedicated page providing
-additional details about the author, the quote itself, and a list of associated tags.
+additional details about the **author**, the **quote** itself, and a list of **associated tags**.
Your task is to extract all of this information and export it into a JSON lines file.
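+
+In a JSON lines file, each record is a single JSON object on its own line.
+One output record might look like this (field names are illustrative,
+values elided):
+
+```
+{"quote": "...", "author": "...", "tags": ["..."]}
+```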
@@ -922,11 +952,11 @@ class QuotesSpider(scrapy.Spider):
**Target:** https://quotes.toscrape.com/scroll
-There has been another modification to the layout. Our quotes page now features an infinite
+Our quotes page now features infinite
scroll functionality, meaning that new content is dynamically loaded as you reach the bottom of the page.
**TIP**: To understand this behavior, open your browser and access our target page. Press **F12** to
-open the developer tools and select the "_Network_" tab. Observe what occurs in the network requests
+open the **developer tools** and select the "_Network_" tab. Observe what occurs in the network requests
when you navigate to the end of the page.
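+
+What the Network tab should reveal is a JSON endpoint being called for each
+new batch of quotes. You can also query it directly; the URL below is the
+one used by the solution later in the deck:
+
+```python
+import requests
+
+data = requests.get("https://quotes.toscrape.com/api/quotes?page=1").json()
+print(data["page"], data["has_next"], len(data["quotes"]))
+```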
---
@@ -963,7 +993,6 @@ class QuotesScrollSpider(scrapy.Spider):
def parse(self, response):
data = response.json()
- current_page = data.get("page")
for quote in data.get("quotes"):
yield {
@@ -975,6 +1004,8 @@ class QuotesScrollSpider(scrapy.Spider):
"tags": quote.get("tags"),
}
+ current_page = data.get("page")
+
if data.get("has_next"):
next_page = current_page + 1
yield scrapy.Request(
@@ -999,7 +1030,6 @@ class QuotesScrollSpider(scrapy.Spider):
def parse(self, response):
data = response.json()
- current_page = data.get("page")
for quote in data.get("quotes"):
yield {
@@ -1011,6 +1041,8 @@ class QuotesScrollSpider(scrapy.Spider):
"tags": quote.get("tags"),
}
+ current_page = data.get("page")
+
if data.get("has_next"):
next_page = current_page + 1
yield scrapy.Request(
@@ -1035,7 +1067,6 @@ class QuotesScrollSpider(scrapy.Spider):
def parse(self, response):
* data = response.json()
- current_page = data.get("page")
for quote in data.get("quotes"):
yield {
@@ -1047,6 +1078,8 @@ class QuotesScrollSpider(scrapy.Spider):
"tags": quote.get("tags"),
}
+ current_page = data.get("page")
+
if data.get("has_next"):
next_page = current_page + 1
yield scrapy.Request(
@@ -1071,43 +1104,6 @@ class QuotesScrollSpider(scrapy.Spider):
def parse(self, response):
data = response.json()
-* current_page = data.get("page")
-
- for quote in data.get("quotes"):
- yield {
- "quote": quote.get("text"),
- "author": quote.get("author").get("name"),
- "author_url": response.urljoin(
- quote.get("author").get("goodreads_link")
- ),
- "tags": quote.get("tags"),
- }
-
-* if data.get("has_next"):
-* next_page = current_page + 1
-* yield scrapy.Request(
-* self.api_url.format(page=next_page),
-* )
-```
-
----
-
-```python
-# code/exercise-2.py
-import scrapy
-
-
-class QuotesScrollSpider(scrapy.Spider):
- name = "quotes_scroll"
- allowed_domains = ["quotes.toscrape.com"]
- api_url = "https://quotes.toscrape.com/api/quotes?page={page}"
-
- def start_requests(self):
- yield scrapy.Request(self.api_url.format(page=1))
-
- def parse(self, response):
- data = response.json()
- current_page = data.get("page")
* for quote in data.get("quotes"):
* yield {
@@ -1119,6 +1115,8 @@ class QuotesScrollSpider(scrapy.Spider):
* "tags": quote.get("tags"),
* }
+ current_page = data.get("page")
+
if data.get("has_next"):
next_page = current_page + 1
yield scrapy.Request(
@@ -1128,12 +1126,48 @@ class QuotesScrollSpider(scrapy.Spider):
---
+```python
+# code/exercise-2.py
+import scrapy
+
+
+class QuotesScrollSpider(scrapy.Spider):
+ name = "quotes_scroll"
+ allowed_domains = ["quotes.toscrape.com"]
+ api_url = "https://quotes.toscrape.com/api/quotes?page={page}"
+
+ def start_requests(self):
+ yield scrapy.Request(self.api_url.format(page=1))
+
+ def parse(self, response):
+ data = response.json()
+
+ for quote in data.get("quotes"):
+ yield {
+ "quote": quote.get("text"),
+ "author": quote.get("author").get("name"),
+ "author_url": response.urljoin(
+ quote.get("author").get("goodreads_link")
+ ),
+ "tags": quote.get("tags"),
+ }
+
+* current_page = data.get("page")
+*
+* if data.get("has_next"):
+* next_page = current_page + 1
+* yield scrapy.Request(
+* self.api_url.format(page=next_page),
+* )
+```
+
+---
+
# Exercise 3
**Target:** https://quotes.toscrape.com/js/
-The spider you created in the first exercise has ceased to function. Although no errors
-are evident in the logs, the spider is not returning any data.
+The spider you created in the first exercise has ceased to function. Although no errors are evident in the logs, the spider is not returning any data.
**TIP**: To troubleshoot, open your browser and navigate to our target page.
Press **Ctrl+U** (_View Page Source_) to inspect the HTML content of the page.
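+
+The same check, scripted: fetch the raw HTML the spider receives and run the
+usual quote selector against it (a minimal sketch, assuming `requests` and
+`parsel` are installed; `.quote` is the class the non-JS pages use):
+
+```python
+import requests
+from parsel import Selector
+
+html = requests.get("https://quotes.toscrape.com/js/").text
+print(Selector(text=html).css(".quote").getall())  # prints [] because the markup is built by JavaScript
+```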