reorganizing slides
parent 63e275fa2f
commit 7c10d1c4b0
1 changed file with 138 additions and 104 deletions
@@ -180,7 +180,7 @@ from parsel import Selector
response = requests.get('https://us.pycon.org/2024/schedule/tutorials/')

-sel = Selector(text=response.body)
+sel = Selector(text=response.text)
for tutorial in sel.css('.calendar a::text').getall():
    print(tutorial)
```
@@ -196,7 +196,7 @@ from parsel import Selector
response = requests.get('https://us.pycon.org/2024/schedule/tutorials/')

-sel = Selector(text=response.body)
+sel = Selector(text=response.text)
for tutorial in sel.css('.calendar a::text').getall():
    print(tutorial)
```
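For reference, the corrected snippet runs on its own once the imports (which sit just above this hunk in the deck) are included; `parsel.Selector` takes a string, which is why the slide moves from the non-existent `response.body` attribute of a `requests` response to `response.text`. A minimal standalone sketch:

```python
# Standalone version of the corrected slide snippet (assumes requests and parsel are installed).
import requests
from parsel import Selector

response = requests.get('https://us.pycon.org/2024/schedule/tutorials/')

# Selector expects a unicode string, so pass response.text
sel = Selector(text=response.text)

for tutorial in sel.css('.calendar a::text').getall():
    print(tutorial)
```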
@@ -433,6 +433,34 @@ class: center, middle
+---
+
+# CSS Selectors Examples
+
+```
+response.css("h1")
+```
+
+```
+response.css("ul#offers")
+```
+
+```
+response.css(".product")
+```
+
+```
+response.css("ul#offers .product a::attr(href)")
+```
+
+```
+response.css("ul#offers .product *::text")
+```
+
+```
+response.css("ul#offers .product p::text")
+```
+
---

# Parsing Data

```
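The selectors on the new slide assume a page with a `ul#offers` list of `.product` items; that markup is not shown in the deck, so the fragment below is made up purely to illustrate what each kind of query returns with `parsel`:

```python
from parsel import Selector

# Made-up HTML fragment matching the structure the slide's selectors assume.
html = """
<h1>Deals</h1>
<ul id="offers">
  <li class="product"><a href="/p/1">Widget</a><p>Only $5</p></li>
  <li class="product"><a href="/p/2">Gadget</a><p>Only $9</p></li>
</ul>
"""
sel = Selector(text=html)

print(sel.css("h1::text").get())                              # 'Deals'
print(sel.css("ul#offers .product a::attr(href)").getall())   # ['/p/1', '/p/2']
print(sel.css("ul#offers .product p::text").getall())         # ['Only $5', 'Only $9']
```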
@@ -487,6 +515,62 @@ class PyConUS2024Spider(scrapy.Spider):
+---
+
+# XPath Examples
+
+```
+response.xpath("//h1")
+```
+
+```
+response.xpath("//h1[2]")
+```
+
+```
+response.xpath("//ul[@id='offers']")
+```
+
+```
+response.xpath("//li/a/@href")
+```
+
+```
+response.xpath("//li//text()")
+```
+
+```
+response.xpath("//li[@class='ad']/following-sibling::li")
+```
+---
+
+
+# Parsing Data
+
+```
+# code/pyconus2024-xpath.py
+import scrapy
+
+class PyConUS2024Spider(scrapy.Spider):
+    name = "pyconus"
+
+    start_urls = [
+        'https://us.pycon.org/2024/schedule/tutorials/',
+    ]
+
+    def parse(self, response):
+        for tutorial in response.xpath('//div[@class="presentation"]'):
+            yield {
+                'speaker': tutorial.xpath(
+                    './div[@class="speaker"]/text()'
+                ).get().strip(),
+                'url': response.urljoin(
+                    tutorial.xpath('.//a/@href').get()
+                ),
+                'title': tutorial.xpath('.//a/text()').get()
+            }
+```
+
---

# Parsing Data

```
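The XPath variants can be tried against a similar made-up fragment (again an illustration, not slide content):

```python
from parsel import Selector

# Assumed markup, invented only to exercise the XPath expressions from the slide.
html = """
<ul id="offers">
  <li class="ad"><a href="/sponsored">Ad</a></li>
  <li class="product"><a href="/p/1">Widget</a></li>
</ul>
"""
sel = Selector(text=html)

print(sel.xpath("//li/a/@href").getall())                                      # ['/sponsored', '/p/1']
print(sel.xpath("//li//text()").getall())                                      # ['Ad', 'Widget']
print(sel.xpath("//li[@class='ad']/following-sibling::li//text()").getall())  # ['Widget']
```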
@@ -503,7 +587,9 @@ class PyConUS2024Spider(scrapy.Spider):
    def parse(self, response):
        for tutorial in response.xpath('//div[@class="presentation"]'):
            yield {
-                'speaker': tutorial.xpath('./div[@class="speaker"]/text()').get().strip(),
+                'speaker': tutorial.xpath(
+                    './div[@class="speaker"]/text()'
+                ).get().strip(),
                'url': response.urljoin(
                    tutorial.xpath('.//a/@href').get()
                ),
@@ -543,62 +629,6 @@ class PyConUS2024Spider(scrapy.Spider):
----
-
-# CSS Selectors Examples
-
-```
-response.css("h1")
-```
-
-```
-response.css("ul#offers")
-```
-
-```
-response.css(".product")
-```
-
-```
-response.css("ul#offers .product a::attr(href)")
-```
-
-```
-response.css("ul#offers .product *::text")
-```
-
-```
-response.css("ul#offers .product p::text")
-```
-
----
-
-# XPath Examples
-
-```
-response.xpath("//h1")
-```
-
-```
-response.xpath("//h1[2]")
-```
-
-```
-response.xpath("//ul[@id='offers']")
-```
-
-```
-response.xpath("//li/a/@href")
-```
-
-```
-response.xpath("//li//text()")
-```
-
-```
-response.xpath("//li[@class='ad']/following-sibling::li")
-```
-
---

# Exporting Results

```
@@ -661,7 +691,7 @@ Your task is to extract all of this information and export it into a JSON lines
On this page, you will find a collection of quotes along with their respective authors.
Each quote is accompanied by a link that directs you to a dedicated page providing
-additional details about the author, the quote itself, and a list of associated tags.
+additional details about the **author**, the **quote** itself, and a list of **associated tags**.

Your task is to extract all of this information and export it into a JSON lines file.
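The exercise slide only states the goal. As a rough sketch (not the deck's reference solution), a listing-page spider for https://quotes.toscrape.com could look like the code below; the CSS class names are assumptions about the target page, and following the per-author detail links is left out.

```python
# Illustrative sketch only; the selectors are assumptions, not the deck's solution.
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        for quote in response.css(".quote"):
            yield {
                "quote": quote.css(".text::text").get(),
                "author": quote.css(".author::text").get(),
                "tags": quote.css(".tag::text").getall(),
            }
```

Exporting to JSON lines then comes from Scrapy's feed exports, e.g. `scrapy runspider quotes_spider.py -o quotes.jl`, where the `.jl` extension selects the JSON lines exporter.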
@@ -922,11 +952,11 @@ class QuotesSpider(scrapy.Spider):
**Target:** https://quotes.toscrape.com/scroll

-There has been another modification to the layout. Our quotes page now features an infinite
+Our quotes page now features an infinite
scroll functionality, meaning that new content is dynamically loaded as you reach the bottom of the page.

**TIP**: To understand this behavior, open your browser and access our target page. Press **F12** to
-open the developer tools and select the "_Network_" tab. Observe what occurs in the network requests
+open the **developer tools** and select the "_Network_" tab. Observe what occurs in the network requests
when you navigate to the end of the page.

---
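What the Network tab reveals on that page is a JSON API being called as you scroll; the endpoint and keys used below come from the spider in the following hunks, and the snippet is just a quick way to confirm the response shape outside the browser:

```python
# Quick check of the JSON endpoint behind the infinite scroll (illustration only).
import requests

data = requests.get("https://quotes.toscrape.com/api/quotes?page=1").json()
print(data["page"], data["has_next"], len(data["quotes"]))
```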
@@ -963,7 +993,6 @@ class QuotesScrollSpider(scrapy.Spider):
    def parse(self, response):
        data = response.json()
-        current_page = data.get("page")

        for quote in data.get("quotes"):
            yield {
@@ -975,6 +1004,8 @@ class QuotesScrollSpider(scrapy.Spider):
                "tags": quote.get("tags"),
            }

+        current_page = data.get("page")
+
        if data.get("has_next"):
            next_page = current_page + 1
            yield scrapy.Request(
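This hunk, and the matching ones after it, only move `current_page = data.get("page")` down next to the pagination check, so the slide reads top to bottom: yield the items first, then decide whether to request the next page. The pattern in isolation, with stubbed data instead of a live response, looks roughly like this:

```python
# Sketch of the pagination logic the slides converge on (stub data, no network).
API_URL = "https://quotes.toscrape.com/api/quotes?page={page}"


def next_page_url(data):
    current_page = data.get("page")
    if data.get("has_next"):
        return API_URL.format(page=current_page + 1)
    return None


print(next_page_url({"page": 1, "has_next": True}))    # .../api/quotes?page=2
print(next_page_url({"page": 10, "has_next": False}))  # None
```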
@@ -999,7 +1030,6 @@ class QuotesScrollSpider(scrapy.Spider):
    def parse(self, response):
        data = response.json()
-        current_page = data.get("page")

        for quote in data.get("quotes"):
            yield {
@@ -1011,6 +1041,8 @@ class QuotesScrollSpider(scrapy.Spider):
                "tags": quote.get("tags"),
            }

+        current_page = data.get("page")
+
        if data.get("has_next"):
            next_page = current_page + 1
            yield scrapy.Request(
@@ -1035,7 +1067,6 @@ class QuotesScrollSpider(scrapy.Spider):
    def parse(self, response):
        data = response.json()
-        current_page = data.get("page")

        for quote in data.get("quotes"):
            yield {
@@ -1047,6 +1078,8 @@ class QuotesScrollSpider(scrapy.Spider):
                "tags": quote.get("tags"),
            }

+        current_page = data.get("page")
+
        if data.get("has_next"):
            next_page = current_page + 1
            yield scrapy.Request(
@@ -1071,43 +1104,6 @@ class QuotesScrollSpider(scrapy.Spider):
    def parse(self, response):
        data = response.json()
-        current_page = data.get("page")
-
-        for quote in data.get("quotes"):
-            yield {
-                "quote": quote.get("text"),
-                "author": quote.get("author").get("name"),
-                "author_url": response.urljoin(
-                    quote.get("author").get("goodreads_link")
-                ),
-                "tags": quote.get("tags"),
-            }
-
-        if data.get("has_next"):
-            next_page = current_page + 1
-            yield scrapy.Request(
-                self.api_url.format(page=next_page),
-            )
-```
-
----
-
-```python
-# code/exercise-2.py
-import scrapy
-
-
-class QuotesScrollSpider(scrapy.Spider):
-    name = "quotes_scroll"
-    allowed_domains = ["quotes.toscrape.com"]
-    api_url = "https://quotes.toscrape.com/api/quotes?page={page}"
-
-    def start_requests(self):
-        yield scrapy.Request(self.api_url.format(page=1))
-
-    def parse(self, response):
-        data = response.json()
-        current_page = data.get("page")

        for quote in data.get("quotes"):
            yield {
@@ -1119,6 +1115,8 @@ class QuotesScrollSpider(scrapy.Spider):
                "tags": quote.get("tags"),
            }

+        current_page = data.get("page")
+
        if data.get("has_next"):
            next_page = current_page + 1
            yield scrapy.Request(
@@ -1128,12 +1126,48 @@ class QuotesScrollSpider(scrapy.Spider):
+---
+
+```python
+# code/exercise-2.py
+import scrapy
+
+
+class QuotesScrollSpider(scrapy.Spider):
+    name = "quotes_scroll"
+    allowed_domains = ["quotes.toscrape.com"]
+    api_url = "https://quotes.toscrape.com/api/quotes?page={page}"
+
+    def start_requests(self):
+        yield scrapy.Request(self.api_url.format(page=1))
+
+    def parse(self, response):
+        data = response.json()
+
+        for quote in data.get("quotes"):
+            yield {
+                "quote": quote.get("text"),
+                "author": quote.get("author").get("name"),
+                "author_url": response.urljoin(
+                    quote.get("author").get("goodreads_link")
+                ),
+                "tags": quote.get("tags"),
+            }
+
+        current_page = data.get("page")
+
+        if data.get("has_next"):
+            next_page = current_page + 1
+            yield scrapy.Request(
+                self.api_url.format(page=next_page),
+            )
+```
+
---

# Exercise 3

**Target:** https://quotes.toscrape.com/js/

-The spider you created in the first exercise has ceased to function. Although no errors
-are evident in the logs, the spider is not returning any data.
+The spider you created in the first exercise has ceased to function. Although no errors are evident in the logs, the spider is not returning any data.

**TIP**: To troubleshoot, open your browser and navigate to our target page.
Press **Ctrl+U** (_View Page Source_) to inspect the HTML content of the page.
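The Exercise 3 slide stops at the hint. What the page source of https://quotes.toscrape.com/js/ usually reveals is that the quotes are rendered client-side from data embedded in a `<script>` block, so one possible approach (an illustration only, not necessarily the deck's solution) is to pull that JSON out of the raw HTML:

```python
# Illustrative approach for a JS-rendered page: the quotes may be embedded in a
# <script> tag as a JSON array ("var data = [...]" is an assumption about the
# target page, not something shown in the slides).
import json
import re

import scrapy


class QuotesJSSpider(scrapy.Spider):
    name = "quotes_js"
    start_urls = ["https://quotes.toscrape.com/js/"]

    def parse(self, response):
        script = response.xpath("//script[contains(., 'var data')]/text()").get()
        match = re.search(r"var data = (\[.*?\]);", script or "", re.DOTALL)
        if not match:
            return
        for quote in json.loads(match.group(1)):
            yield {
                "quote": quote.get("text"),
                "author": quote.get("author", {}).get("name"),
                "tags": quote.get("tags"),
            }
```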