From 7c10d1c4b0ac4c266d532e7f475980f5e112e3ec Mon Sep 17 00:00:00 2001 From: Renne Rocha Date: Sun, 12 May 2024 10:07:36 -0300 Subject: [PATCH] reorganizing slides --- presentation/presentation.html | 242 +++++++++++++++++++-------------- 1 file changed, 138 insertions(+), 104 deletions(-) diff --git a/presentation/presentation.html b/presentation/presentation.html index 2710895..59626f5 100644 --- a/presentation/presentation.html +++ b/presentation/presentation.html @@ -180,7 +180,7 @@ from parsel import Selector *response = requests.get('https://us.pycon.org/2024/schedule/tutorials/') -sel = Selector(text=response.body) +sel = Selector(text=response.text) for tutorial in sel.css('.calendar a::text').getall(): print(tutorial) ``` @@ -196,7 +196,7 @@ from parsel import Selector response = requests.get('https://us.pycon.org/2024/schedule/tutorials/') -sel = Selector(text=response.body) +sel = Selector(text=response.text) *for tutorial in sel.css('.calendar a::text').getall(): * print(tutorial) ``` @@ -433,6 +433,34 @@ class: center, middle --- +# CSS Selectors Examples + +``` +response.css("h1") +``` + +``` +response.css("ul#offers") +``` + +``` +response.css(".product") +``` + +``` +response.css("ul#offers .product a::attr(href)") +``` + +``` +response.css("ul#offers .product *::text") +``` + +``` +response.css("ul#offers .product p::text") +``` + +--- + # Parsing Data ``` @@ -487,6 +515,62 @@ class PyConUS2024Spider(scrapy.Spider): --- +# XPath Examples + +``` +response.xpath("//h1") +``` + +``` +response.xpath("//h1[2]") +``` + +``` +response.xpath("//ul[@id='offers']") +``` + +``` +response.xpath("//li/a/@href") +``` + +``` +response.xpath("//li//text()") +``` + +``` +response.xpath("//li[@class='ad']/following-sibling::li") +``` +--- + + +# Parsing Data + +``` +# code/pyconus2024-xpath.py +import scrapy + +class PyConUS2024Spider(scrapy.Spider): + name = "pyconus" + + start_urls = [ + 'https://us.pycon.org/2024/schedule/tutorials/', + ] + + def parse(self, response): + for tutorial in response.xpath('//div[@class="presentation"]'): + yield { + 'speaker': tutorial.xpath( + './div[@class="speaker"]/text()' + ).get().strip(), + 'url': response.urljoin( + tutorial.xpath('.//a/@href').get() + ), + 'title': tutorial.xpath('.//a/text()').get() + } +``` + +--- + # Parsing Data ``` @@ -503,7 +587,9 @@ class PyConUS2024Spider(scrapy.Spider): def parse(self, response): * for tutorial in response.xpath('//div[@class="presentation"]'): yield { -* 'speaker': tutorial.xpath('./div[@class="speaker"]/text()').get().strip(), +* 'speaker': tutorial.xpath( +* './div[@class="speaker"]/text()' +* ).get().strip(), 'url': response.urljoin( * tutorial.xpath('.//a/@href').get() ), @@ -543,62 +629,6 @@ class PyConUS2024Spider(scrapy.Spider): --- -# CSS Selectors Examples - -``` -response.css("h1") -``` - -``` -response.css("ul#offers") -``` - -``` -response.css(".product") -``` - -``` -response.css("ul#offers .product a::attr(href)") -``` - -``` -response.css("ul#offers .product *::text") -``` - -``` -response.css("ul#offers .product p::text") -``` - ---- - -# XPath Examples - -``` -response.xpath("//h1") -``` - -``` -response.xpath("//h1[2]") -``` - -``` -response.xpath("//ul[@id='offers']") -``` - -``` -response.xpath("//li/a/@href") -``` - -``` -response.xpath("//li//text()") -``` - -``` -response.xpath("//li[@class='ad']/following-sibling::li") -``` - ---- - # Exporting Results ``` @@ -661,7 +691,7 @@ Your task is to extract all of this information and export it into a JSON lines On this page, you will find 
a collection of quotes along with their respective authors. Each quote is accompanied by a link that directs you to a dedicated page providing -additional details about the author, the quote itself, and a list of associated tags. +additional details about the **author**, the **quote** itself, and a list of **associated tags**. Your task is to extract all of this information and export it into a JSON lines file. @@ -922,11 +952,11 @@ class QuotesSpider(scrapy.Spider): **Target:** https://quotes.toscrape.com/scroll -There has been another modification to the layout. Our quotes page now features an infinite +Our quotes page now features an infinite scroll functionality, meaning that new content is dynamically loaded as you reach the bottom of the page. **TIP**: To understand this behavior, open your browser and access our target page. Press **F12** to -open the developer tools and select the "_Network_" tab. Observe what occurs in the network requests +open the **developer tools** and select the "_Network_" tab. Observe what occurs in the network requests when you navigate to the end of the page. --- @@ -963,7 +993,6 @@ class QuotesScrollSpider(scrapy.Spider): def parse(self, response): data = response.json() - current_page = data.get("page") for quote in data.get("quotes"): yield { @@ -975,6 +1004,8 @@ class QuotesScrollSpider(scrapy.Spider): "tags": quote.get("tags"), } + current_page = data.get("page") + if data.get("has_next"): next_page = current_page + 1 yield scrapy.Request( @@ -999,7 +1030,6 @@ class QuotesScrollSpider(scrapy.Spider): def parse(self, response): data = response.json() - current_page = data.get("page") for quote in data.get("quotes"): yield { @@ -1011,6 +1041,8 @@ class QuotesScrollSpider(scrapy.Spider): "tags": quote.get("tags"), } + current_page = data.get("page") + if data.get("has_next"): next_page = current_page + 1 yield scrapy.Request( @@ -1035,7 +1067,6 @@ class QuotesScrollSpider(scrapy.Spider): def parse(self, response): * data = response.json() - current_page = data.get("page") for quote in data.get("quotes"): yield { @@ -1047,6 +1078,8 @@ class QuotesScrollSpider(scrapy.Spider): "tags": quote.get("tags"), } + current_page = data.get("page") + if data.get("has_next"): next_page = current_page + 1 yield scrapy.Request( @@ -1071,43 +1104,6 @@ class QuotesScrollSpider(scrapy.Spider): def parse(self, response): data = response.json() -* current_page = data.get("page") - - for quote in data.get("quotes"): - yield { - "quote": quote.get("text"), - "author": quote.get("author").get("name"), - "author_url": response.urljoin( - quote.get("author").get("goodreads_link") - ), - "tags": quote.get("tags"), - } - -* if data.get("has_next"): -* next_page = current_page + 1 -* yield scrapy.Request( -* self.api_url.format(page=next_page), -* ) -``` - ---- - -```python -# code/exercise-2.py -import scrapy - - -class QuotesScrollSpider(scrapy.Spider): - name = "quotes_scroll" - allowed_domains = ["quotes.toscrape.com"] - api_url = "https://quotes.toscrape.com/api/quotes?page={page}" - - def start_requests(self): - yield scrapy.Request(self.api_url.format(page=1)) - - def parse(self, response): - data = response.json() - current_page = data.get("page") * for quote in data.get("quotes"): * yield { @@ -1119,6 +1115,8 @@ class QuotesScrollSpider(scrapy.Spider): * "tags": quote.get("tags"), * } + current_page = data.get("page") + if data.get("has_next"): next_page = current_page + 1 yield scrapy.Request( @@ -1128,12 +1126,48 @@ class QuotesScrollSpider(scrapy.Spider): --- +```python 
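+# Assumed API response shape, inferred from the .get() accessors in this
+# spider (not from any separate API documentation):
+#   {"page": 1, "has_next": true,
+#    "quotes": [{"text": "...", "tags": ["..."],
+#                "author": {"name": "...", "goodreads_link": "..."}}]}
+# While "has_next" is true, the spider keeps requesting the next page.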
+# code/exercise-2.py +import scrapy + + +class QuotesScrollSpider(scrapy.Spider): + name = "quotes_scroll" + allowed_domains = ["quotes.toscrape.com"] + api_url = "https://quotes.toscrape.com/api/quotes?page={page}" + + def start_requests(self): + yield scrapy.Request(self.api_url.format(page=1)) + + def parse(self, response): + data = response.json() + + for quote in data.get("quotes"): + yield { + "quote": quote.get("text"), + "author": quote.get("author").get("name"), + "author_url": response.urljoin( + quote.get("author").get("goodreads_link") + ), + "tags": quote.get("tags"), + } + +* current_page = data.get("page") +* +* if data.get("has_next"): +* next_page = current_page + 1 +* yield scrapy.Request( +* self.api_url.format(page=next_page), +* ) +``` + +--- + # Exercise 3 **Target:** https://quotes.toscrape.com/js/ -The spider you created in the first exercise has ceased to function. Although no errors -are evident in the logs, the spider is not returning any data. +The spider you created in the first exercise has ceased to function. Although no errors are evident in the logs, the spider is not returning any data. **TIP**: To troubleshoot, open your browser and navigate to our target page. Press **Ctrl+U** (_View Page Source_) to inspect the HTML content of the page.
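+
+---
+
+# Exercise 3 - A Possible Direction
+
+One possible direction (not necessarily the intended solution): assuming the
+page source embeds the quotes in an inline `<script>` tag as a JavaScript
+array (`var data = [...]`), you can extract and parse that array directly.
+The file name, spider name, and regular expression below are illustrative;
+adjust them to match what you actually see via **Ctrl+U**.
+
+```python
+# code/exercise-3-sketch.py (illustrative sketch)
+import json
+import re
+
+import scrapy
+
+
+class QuotesJSSpider(scrapy.Spider):
+    name = "quotes_js"
+    allowed_domains = ["quotes.toscrape.com"]
+    start_urls = ["https://quotes.toscrape.com/js/"]
+
+    def parse(self, response):
+        # Locate the inline script that assigns the data array
+        # (assumes a "var data = [...]" assignment is present).
+        script = response.xpath(
+            '//script[contains(., "var data")]/text()'
+        ).get()
+        if not script:
+            return
+        match = re.search(r"var data = (\[.*?\]);", script, re.DOTALL)
+        if not match:
+            return
+        for quote in json.loads(match.group(1)):
+            yield {
+                "quote": quote.get("text"),
+                "author": quote.get("author", {}).get("name"),
+                "tags": quote.get("tags"),
+            }
+```
+
+`scrapy shell https://quotes.toscrape.com/js/` is a convenient place to test
+the selector and the regular expression before running the spider.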