Add Teatro Castro Mendes spider

This commit is contained in:
Renne Rocha 2024-12-13 21:58:20 -03:00
parent f9d7a5d09e
commit a4a9b17c2d
5 changed files with 127 additions and 1 deletions

View file

@ -17,6 +17,7 @@ class Event(peewee.Model):
location = peewee.CharField()
published = peewee.BooleanField()
image_path = peewee.CharField()
tags = peewee.TextField()
class Meta:
database = get_db()

View file

@ -16,6 +16,7 @@ class DataCollectorPipeline:
def process_item(self, item, spider):
image_path = item["images"][0]["path"] if item["images"] else None
location = re.sub(r"\s", " ", item["location"])
tags = item.get("tags") or []
event = Event(
title=item["title"],
@ -26,6 +27,7 @@ class DataCollectorPipeline:
location=location,
published=False,
image_path=image_path,
tags=",".join(tags),
)
try:
event.save()

View file

@ -0,0 +1,94 @@
import copy
import re
from datetime import datetime, timedelta
from urllib.parse import urlencode, urljoin
import scrapy
from slugify import slugify
# Key sent as the `apikey` query parameter on every API request this spider makes.
# NOTE(review): the key is hard-coded in source. Presumably it is the public key
# used by the theater website's own frontend -- confirm; if it is a private
# credential it should be loaded from settings/environment instead.
API_KEY = "live_2d2837a14f7043dab07f23022cb4fa3c1034caf1d92c4b198c86402146924c08"
class CastroMendesSpider(scrapy.Spider):
    """Collect events from the Teatro Castro Mendes API.

    The crawl is a chain of three JSON requests:

    1. ``/v1/home/card`` lists the events (handled by ``parse``).
    2. ``/v1/event/get`` returns one event's details (``parse_event``).
    3. ``/v1/event/presentation`` lists the sessions of that event
       (``parse_presentations``), and one item is yielded per session.
    """

    name = "castro_mendes"
    start_urls = [
        f"https://api.teatrocastromendes.com.br/v1/home/card?apikey={API_KEY}"
    ]

    _SITE_URL = "https://teatrocastromendes.com.br"
    _EVENT_DETAILS_URL = "https://api.teatrocastromendes.com.br/v1/event/get"
    _EVENT_PRESENTATIONS_URL = (
        "https://api.teatrocastromendes.com.br/v1/event/presentation"
    )

    # Used when the description does not state the event duration.
    _DEFAULT_DURATION_MINUTES = 60
    _DURATION_RE = re.compile(r"Duração do espetáculo: (\d+) minutos")

    def parse(self, response):
        """Build the base data of each listed event and request its details.

        Args:
            response: JSON response of the home cards endpoint; iterated as a
                list of event objects.

        Yields:
            scrapy.Request: one details request per event, carrying the
            partially filled ``event_data`` dict to ``parse_event``.
        """
        for event in response.json():
            event_data = {
                "title": event["ds_evento"],
                "url": urljoin(self._SITE_URL, event["uri"]),
                "location": event["ds_nome_teatro"],
                "image_urls": [
                    event["img"],
                ],
                "tags": [
                    "campinas",
                    "castro-mendes",
                    slugify(event["genreName"]),
                ],
            }
            encoded_params = urlencode(
                {
                    # The details endpoint is keyed by the event URI without
                    # its "/evento/" part.
                    "key": event["uri"].replace("/evento/", ""),
                    "apikey": API_KEY,
                }
            )
            yield scrapy.Request(
                f"{self._EVENT_DETAILS_URL}?{encoded_params}",
                callback=self.parse_event,
                cb_kwargs={"event_data": event_data},
            )

    def parse_event(self, response, event_data):
        """Add the description to ``event_data`` and request its presentations.

        Args:
            response: JSON response of the event details endpoint.
            event_data: dict built by ``parse``; mutated in place to add the
                ``"description"`` key.

        Yields:
            scrapy.Request: the presentations request for this event.
        """
        data = response.json()
        event_data["description"] = data["description"]
        encoded_params = urlencode(
            {
                "id_base": data["id_base"],
                "codPeca": data["CodPeca"],
                "apikey": API_KEY,
            }
        )
        yield scrapy.Request(
            f"{self._EVENT_PRESENTATIONS_URL}?{encoded_params}",
            callback=self.parse_presentations,
            cb_kwargs={"event_data": event_data},
        )

    def parse_presentations(self, response, event_data):
        """Yield one item per presentation with its start and end datetimes.

        Args:
            response: JSON response of the presentations endpoint; iterated as
                a list of presentation objects.
            event_data: dict built by ``parse`` and ``parse_event``. It is not
                modified; every yielded item is a deep copy of it.

        Yields:
            dict: a copy of ``event_data`` plus ``"start_datetime"`` and
            ``"end_datetime"`` (naive ``datetime`` objects).
        """
        # The duration depends only on the event description, so it is
        # computed once instead of once per presentation.
        duration = self._event_duration(event_data["description"])
        for presentation in response.json():
            presentation_data = copy.deepcopy(event_data)
            hour = presentation["HorSessao"]
            day = presentation["day"]
            year = presentation["year"]
            # `day` must already hold day and month ("dd/mm") for the format
            # below to match.
            start_datetime = datetime.strptime(
                f"{day}/{year} {hour}", "%d/%m/%Y %H:%M"
            )
            presentation_data["start_datetime"] = start_datetime
            # Fix: the original stored start_datetime here, discarding the
            # computed end of the presentation.
            presentation_data["end_datetime"] = start_datetime + duration
            yield presentation_data

    def _event_duration(self, description):
        """Return the event duration as a ``timedelta``.

        The duration is read from the last "Duração do espetáculo: N minutos"
        occurrence in ``description``. When there is none, or the description
        is empty or ``None``, the 60-minute default is used.
        """
        matches = self._DURATION_RE.findall(description or "")
        minutes = int(matches[-1]) if matches else self._DEFAULT_DURATION_MINUTES
        return timedelta(minutes=minutes)