From a4a9b17c2ddfa4a1d949cda71296a473db67d88c Mon Sep 17 00:00:00 2001 From: Renne Rocha Date: Fri, 13 Dec 2024 21:58:20 -0300 Subject: [PATCH] Add Teatro Castro Mendes spider --- data_collector/data_collector/models.py | 1 + data_collector/data_collector/pipelines.py | 2 + .../data_collector/spiders/castro_mendes.py | 94 +++++++++++++++++++ poetry.lock | 30 +++++- pyproject.toml | 1 + 5 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 data_collector/data_collector/spiders/castro_mendes.py diff --git a/data_collector/data_collector/models.py b/data_collector/data_collector/models.py index 49f4a34..bf153bf 100644 --- a/data_collector/data_collector/models.py +++ b/data_collector/data_collector/models.py @@ -17,6 +17,7 @@ class Event(peewee.Model): location = peewee.CharField() published = peewee.BooleanField() image_path = peewee.CharField() + tags = peewee.TextField() class Meta: database = get_db() diff --git a/data_collector/data_collector/pipelines.py b/data_collector/data_collector/pipelines.py index 539b432..ee64845 100644 --- a/data_collector/data_collector/pipelines.py +++ b/data_collector/data_collector/pipelines.py @@ -16,6 +16,7 @@ class DataCollectorPipeline: def process_item(self, item, spider): image_path = item["images"][0]["path"] if item["images"] else None location = re.sub(r"\s", " ", item["location"]) + tags = item.get("tags") or [] event = Event( title=item["title"], @@ -26,6 +27,7 @@ class DataCollectorPipeline: location=location, published=False, image_path=image_path, + tags=",".join(tags), ) try: event.save() diff --git a/data_collector/data_collector/spiders/castro_mendes.py b/data_collector/data_collector/spiders/castro_mendes.py new file mode 100644 index 0000000..b3b2eb9 --- /dev/null +++ b/data_collector/data_collector/spiders/castro_mendes.py @@ -0,0 +1,94 @@ +import copy +import re +from datetime import datetime, timedelta +from urllib.parse import urlencode, urljoin + +import scrapy +from slugify import slugify + 
+API_KEY = "live_2d2837a14f7043dab07f23022cb4fa3c1034caf1d92c4b198c86402146924c08"
+
+
+class CastroMendesSpider(scrapy.Spider):
+    """Scrape events and their presentations from the Teatro Castro Mendes API."""
+
+    name = "castro_mendes"
+    start_urls = [
+        f"https://api.teatrocastromendes.com.br/v1/home/card?apikey={API_KEY}"
+    ]
+
+    def parse(self, response):
+        """Request the details of every event listed by the home endpoint."""
+        for event in response.json():
+            event_url = urljoin("https://teatrocastromendes.com.br", event["uri"])
+            event_data = {
+                "title": event["ds_evento"],
+                "url": event_url,
+                "location": event["ds_nome_teatro"],
+                "image_urls": [event["img"]],
+                "tags": [
+                    "campinas",
+                    "castro-mendes",
+                    slugify(event["genreName"]),
+                ],
+            }
+
+            encoded_params = urlencode(
+                {
+                    "key": event["uri"].replace("/evento/", ""),
+                    "apikey": API_KEY,
+                }
+            )
+            url = f"https://api.teatrocastromendes.com.br/v1/event/get?{encoded_params}"
+
+            yield scrapy.Request(
+                url,
+                callback=self.parse_event,
+                cb_kwargs={"event_data": event_data},
+            )
+
+    def parse_event(self, response, event_data):
+        """Store the event description, then request the event's presentations."""
+        data = response.json()
+        event_data["description"] = data["description"]
+
+        encoded_params = urlencode(
+            {
+                "id_base": data["id_base"],
+                "codPeca": data["CodPeca"],
+                "apikey": API_KEY,
+            }
+        )
+        event_presentations_url = (
+            "https://api.teatrocastromendes.com.br/v1/event/presentation"
+        )
+        url = f"{event_presentations_url}?{encoded_params}"
+
+        yield scrapy.Request(
+            url,
+            callback=self.parse_presentations,
+            cb_kwargs={"event_data": event_data},
+        )
+
+    def parse_presentations(self, response, event_data):
+        """Yield one item per presentation, with its start and end datetimes."""
+        for presentation in response.json():
+            presentation_data = copy.deepcopy(event_data)
+
+            hour = presentation["HorSessao"]
+            day = presentation["day"]
+            year = presentation["year"]
+
+            start_datetime = datetime.strptime(f"{day}/{year} {hour}", "%d/%m/%Y %H:%M")
+            presentation_data["start_datetime"] = start_datetime
+
+            duration = 60  # If not able to find event duration in description, use 60min as default
+            duration_match = re.findall(
+                r"Duração do espetáculo: (\d+) minutos", event_data["description"]
+            )
+            if duration_match:
+                duration = int(duration_match.pop())
+            end_datetime = start_datetime + timedelta(minutes=duration)
+            presentation_data["end_datetime"] = end_datetime
+
+            yield presentation_data
diff --git a/poetry.lock b/poetry.lock
index 3e2e496..afcaa18 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1445,6 +1445,23 @@ files = [
     {file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"},
 ]
 
+[[package]]
+name = "python-slugify"
+version = "8.0.4"
+description = "A Python slugify application that also handles Unicode"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "python-slugify-8.0.4.tar.gz", hash = "sha256:59202371d1d05b54a9e7720c5e038f928f45daaffe41dd10822f3907b937c856"},
+    {file = "python_slugify-8.0.4-py2.py3-none-any.whl", hash = "sha256:276540b79961052b66b7d116620b36518847f52d5fd9e3a70164fc8c50faa6b8"},
+]
+
+[package.dependencies]
+text-unidecode = ">=1.3"
+
+[package.extras]
+unidecode = ["Unidecode (>=1.1.1)"]
+
 [[package]]
 name = "pyyaml"
 version = "6.0.2"
@@ -1691,6 +1708,17 @@ pure-eval = "*"
 [package.extras]
 tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"]
 
+[[package]]
+name = "text-unidecode"
+version = "1.3"
+description = "The most basic Text::Unidecode port"
+optional = false
+python-versions = "*"
+files = [
+    {file = "text-unidecode-1.3.tar.gz", hash = "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"},
+    {file = "text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8"},
+]
+
 [[package]]
 name = "tldextract"
 version = "5.1.3"
@@ -1889,4 +1917,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.13"
-content-hash = "4ea6ce1ed26377f323e9ce7c945ad7833485e6babeb505a6944833b830eb7375"
+content-hash =
"22eb2ecfd0bdc6014c7ee3a73b62e2533b709251e442b4ae1053310afcd0cedd" diff --git a/pyproject.toml b/pyproject.toml index e315249..548658d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ httpx = "^0.28.0" curlify2 = "^2.0.0" requests = "^2.32.3" pillow = "^11.0.0" +python-slugify = "^8.0.4" [tool.poetry.group.dev.dependencies]