Add Teatro Castro Mendes spider
This commit is contained in:
parent
f9d7a5d09e
commit
a4a9b17c2d
5 changed files with 127 additions and 1 deletions
|
@ -17,6 +17,7 @@ class Event(peewee.Model):
|
|||
location = peewee.CharField()
|
||||
published = peewee.BooleanField()
|
||||
image_path = peewee.CharField()
|
||||
tags = peewee.TextField()
|
||||
|
||||
class Meta:
|
||||
database = get_db()
|
||||
|
|
|
@ -16,6 +16,7 @@ class DataCollectorPipeline:
|
|||
def process_item(self, item, spider):
|
||||
image_path = item["images"][0]["path"] if item["images"] else None
|
||||
location = re.sub(r"\s", " ", item["location"])
|
||||
tags = item.get("tags") or []
|
||||
|
||||
event = Event(
|
||||
title=item["title"],
|
||||
|
@ -26,6 +27,7 @@ class DataCollectorPipeline:
|
|||
location=location,
|
||||
published=False,
|
||||
image_path=image_path,
|
||||
tags=",".join(tags),
|
||||
)
|
||||
try:
|
||||
event.save()
|
||||
|
|
94
data_collector/data_collector/spiders/castro_mendes.py
Normal file
94
data_collector/data_collector/spiders/castro_mendes.py
Normal file
|
@ -0,0 +1,94 @@
|
|||
import copy
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
from urllib.parse import urlencode, urljoin
|
||||
|
||||
import scrapy
|
||||
from slugify import slugify
|
||||
|
||||
# Key sent as the `apikey` query parameter on every request this module makes
# to api.teatrocastromendes.com.br.
# NOTE(review): this looks like a live credential committed to the repository;
# confirm it is a public client-side key, otherwise load it from settings/env.
API_KEY = "live_2d2837a14f7043dab07f23022cb4fa3c1034caf1d92c4b198c86402146924c08"
|
||||
|
||||
|
||||
class CastroMendesSpider(scrapy.Spider):
    """Collect events from the Teatro Castro Mendes API.

    Each event is resolved through a chain of three requests:

    1. ``parse`` reads the home card listing and builds the fields shared by
       every presentation of an event.
    2. ``parse_event`` adds the description from the event details endpoint.
    3. ``parse_presentations`` yields one item per presentation, each with
       its own ``start_datetime`` and ``end_datetime``.
    """

    name = "castro_mendes"
    start_urls = [
        f"https://api.teatrocastromendes.com.br/v1/home/card?apikey={API_KEY}"
    ]

    # Used when the description does not state the show's duration.
    _DEFAULT_DURATION_MINUTES = 60
    _DURATION_RE = re.compile(r"Duração do espetáculo: (\d+) minutos")

    @staticmethod
    def _api_url(endpoint, params):
        """Build an API URL for *endpoint*, appending the API key last."""
        query = urlencode({**params, "apikey": API_KEY})
        return f"https://api.teatrocastromendes.com.br/v1/{endpoint}?{query}"

    @classmethod
    def _duration_minutes(cls, description):
        """Return the duration in minutes announced in *description*.

        Falls back to the default when the description is missing/empty or
        does not contain the expected sentence. If the sentence appears more
        than once, the last occurrence wins.
        """
        if not description:
            return cls._DEFAULT_DURATION_MINUTES
        matches = cls._DURATION_RE.findall(description)
        if not matches:
            return cls._DEFAULT_DURATION_MINUTES
        return int(matches[-1])

    def parse(self, response):
        """Request the details of every event in the home card listing.

        The fields already known from the listing are carried to the next
        callback through ``cb_kwargs``.
        """
        for event in response.json():
            event_data = {
                "title": event["ds_evento"],
                "url": urljoin("https://teatrocastromendes.com.br", event["uri"]),
                "location": event["ds_nome_teatro"],
                "image_urls": [event["img"]],
                "tags": [
                    "campinas",
                    "castro-mendes",
                    slugify(event["genreName"]),
                ],
            }

            # The details endpoint is keyed by the event URI without its
            # "/evento/" prefix.
            details_url = self._api_url(
                "event/get",
                {"key": event["uri"].replace("/evento/", "")},
            )

            yield scrapy.Request(
                details_url,
                callback=self.parse_event,
                cb_kwargs={"event_data": event_data},
            )

    def parse_event(self, response, event_data):
        """Add the description to *event_data* and request its presentations."""
        data = response.json()
        event_data["description"] = data["description"]

        presentations_url = self._api_url(
            "event/presentation",
            {"id_base": data["id_base"], "codPeca": data["CodPeca"]},
        )

        yield scrapy.Request(
            presentations_url,
            callback=self.parse_presentations,
            cb_kwargs={"event_data": event_data},
        )

    def parse_presentations(self, response, event_data):
        """Yield one item per presentation of the event.

        Every item is an independent deep copy of *event_data* extended with
        ``start_datetime`` and ``end_datetime``. The end is the start plus the
        duration found in the description (or the default duration).
        """
        # The duration depends only on the event, so compute it once.
        duration = timedelta(
            minutes=self._duration_minutes(event_data["description"])
        )

        for presentation in response.json():
            # Deep copy so the list-valued fields are not shared between items.
            presentation_data = copy.deepcopy(event_data)

            hour = presentation["HorSessao"]
            day = presentation["day"]
            year = presentation["year"]

            # NOTE(review): the format implies "day" holds day and month as
            # "dd/mm" and "HorSessao" is "HH:MM" — confirm against the API.
            start_datetime = datetime.strptime(
                f"{day}/{year} {hour}", "%d/%m/%Y %H:%M"
            )

            presentation_data["start_datetime"] = start_datetime
            # Store the computed end; it must not repeat the start time.
            presentation_data["end_datetime"] = start_datetime + duration

            yield presentation_data
|
Loading…
Add table
Add a link
Reference in a new issue