Add Teatro Castro Mendes spider

This commit is contained in:
Renne Rocha 2024-12-13 21:58:20 -03:00
parent f9d7a5d09e
commit a4a9b17c2d
5 changed files with 127 additions and 1 deletions

View file

@ -17,6 +17,7 @@ class Event(peewee.Model):
location = peewee.CharField()
published = peewee.BooleanField()
image_path = peewee.CharField()
tags = peewee.TextField()
class Meta:
database = get_db()

View file

@ -16,6 +16,7 @@ class DataCollectorPipeline:
def process_item(self, item, spider):
image_path = item["images"][0]["path"] if item["images"] else None
location = re.sub(r"\s", " ", item["location"])
tags = item.get("tags") or []
event = Event(
title=item["title"],
@ -26,6 +27,7 @@ class DataCollectorPipeline:
location=location,
published=False,
image_path=image_path,
tags=",".join(tags),
)
try:
event.save()

View file

@ -0,0 +1,94 @@
import copy
import re
from datetime import datetime, timedelta
from urllib.parse import urlencode, urljoin
import scrapy
from slugify import slugify
# NOTE(review): this looks like a hard-coded API key committed to source control.
# It appears to be the same public key the theater's own website frontend sends
# with every request — presumably safe to embed, but confirm it is not a secret.
API_KEY = "live_2d2837a14f7043dab07f23022cb4fa3c1034caf1d92c4b198c86402146924c08"
class CastroMendesSpider(scrapy.Spider):
    """Scrape event listings from Teatro Castro Mendes (Campinas, Brazil).

    Crawl flow:
      1. ``parse`` reads the home "card" API and emits one details request
         per event.
      2. ``parse_event`` adds the description and requests the list of
         presentations (dates/times) for the event.
      3. ``parse_presentations`` yields one item per presentation with
         ``start_datetime``/``end_datetime`` filled in.
    """

    name = "castro_mendes"
    start_urls = [
        f"https://api.teatrocastromendes.com.br/v1/home/card?apikey={API_KEY}"
    ]

    def parse(self, response):
        """Parse the home card listing and schedule a details request per event.

        ``response`` is the JSON array returned by the ``/v1/home/card``
        endpoint; each entry describes one event.
        """
        for event in response.json():
            event_url = urljoin("https://teatrocastromendes.com.br", event["uri"])
            event_data = {
                "title": event["ds_evento"],
                "url": event_url,
                "location": event["ds_nome_teatro"],
                "image_urls": [
                    event["img"],
                ],
                "tags": [
                    "campinas",
                    "castro-mendes",
                    slugify(event["genreName"]),
                ],
            }
            encoded_params = urlencode(
                {
                    # The details endpoint keys events by the URI with the
                    # "/evento/" prefix stripped.
                    "key": event["uri"].replace("/evento/", ""),
                    "apikey": API_KEY,
                }
            )
            event_details_url = "https://api.teatrocastromendes.com.br/v1/event/get"
            url = f"{event_details_url}?{encoded_params}"
            yield scrapy.Request(
                url,
                callback=self.parse_event,
                cb_kwargs={"event_data": event_data},
            )

    def parse_event(self, response, event_data):
        """Attach the event description and schedule the presentations request.

        ``event_data`` is the partially-filled item from :meth:`parse`; it is
        passed along via ``cb_kwargs``.
        """
        data = response.json()
        event_data["description"] = data["description"]
        encoded_params = urlencode(
            {
                "id_base": data["id_base"],
                "codPeca": data["CodPeca"],
                "apikey": API_KEY,
            }
        )
        event_presentations_url = (
            "https://api.teatrocastromendes.com.br/v1/event/presentation"
        )
        url = f"{event_presentations_url}?{encoded_params}"
        yield scrapy.Request(
            url,
            callback=self.parse_presentations,
            cb_kwargs={"event_data": event_data},
        )

    def parse_presentations(self, response, event_data):
        """Yield one item per presentation with start and end datetimes."""
        # The duration depends only on the event description, so parse it once
        # per event instead of once per presentation (it was inside the loop).
        duration = 60  # Fallback when the description does not state a duration.
        duration_match = re.findall(
            r"Duração do espetáculo: (\d+) minutos", event_data["description"]
        )
        if duration_match:
            duration = int(duration_match.pop())
        for presentation in response.json():
            presentation_data = copy.deepcopy(event_data)
            hour = presentation["HorSessao"]
            day = presentation["day"]
            year = presentation["year"]
            # "day" presumably holds "dd/mm" since it is combined with the year
            # to match "%d/%m/%Y" — TODO confirm against the API payload.
            start_datetime = datetime.strptime(
                f"{day}/{year} {hour}", "%d/%m/%Y %H:%M"
            )
            presentation_data["start_datetime"] = start_datetime
            end_datetime = start_datetime + timedelta(minutes=duration)
            # BUG FIX: the original assigned start_datetime here, so every
            # presentation ended the moment it started.
            presentation_data["end_datetime"] = end_datetime
            yield presentation_data

30
poetry.lock generated
View file

@ -1445,6 +1445,23 @@ files = [
{file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"},
]
[[package]]
name = "python-slugify"
version = "8.0.4"
description = "A Python slugify application that also handles Unicode"
optional = false
python-versions = ">=3.7"
files = [
{file = "python-slugify-8.0.4.tar.gz", hash = "sha256:59202371d1d05b54a9e7720c5e038f928f45daaffe41dd10822f3907b937c856"},
{file = "python_slugify-8.0.4-py2.py3-none-any.whl", hash = "sha256:276540b79961052b66b7d116620b36518847f52d5fd9e3a70164fc8c50faa6b8"},
]
[package.dependencies]
text-unidecode = ">=1.3"
[package.extras]
unidecode = ["Unidecode (>=1.1.1)"]
[[package]]
name = "pyyaml"
version = "6.0.2"
@ -1691,6 +1708,17 @@ pure-eval = "*"
[package.extras]
tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"]
[[package]]
name = "text-unidecode"
version = "1.3"
description = "The most basic Text::Unidecode port"
optional = false
python-versions = "*"
files = [
{file = "text-unidecode-1.3.tar.gz", hash = "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"},
{file = "text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8"},
]
[[package]]
name = "tldextract"
version = "5.1.3"
@ -1889,4 +1917,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.0"
python-versions = "^3.13"
content-hash = "4ea6ce1ed26377f323e9ce7c945ad7833485e6babeb505a6944833b830eb7375"
content-hash = "22eb2ecfd0bdc6014c7ee3a73b62e2533b709251e442b4ae1053310afcd0cedd"

View file

@ -15,6 +15,7 @@ httpx = "^0.28.0"
curlify2 = "^2.0.0"
requests = "^2.32.3"
pillow = "^11.0.0"
python-slugify = "^8.0.4"
[tool.poetry.group.dev.dependencies]