Add Teatro Castro Mendes spider
This commit is contained in:
parent
f9d7a5d09e
commit
a4a9b17c2d
5 changed files with 127 additions and 1 deletion
|
@ -17,6 +17,7 @@ class Event(peewee.Model):
|
||||||
location = peewee.CharField()
|
location = peewee.CharField()
|
||||||
published = peewee.BooleanField()
|
published = peewee.BooleanField()
|
||||||
image_path = peewee.CharField()
|
image_path = peewee.CharField()
|
||||||
|
tags = peewee.TextField()
|
||||||
|
|
||||||
class Meta:
|
class Meta:
|
||||||
database = get_db()
|
database = get_db()
|
||||||
|
|
|
@ -16,6 +16,7 @@ class DataCollectorPipeline:
|
||||||
def process_item(self, item, spider):
|
def process_item(self, item, spider):
|
||||||
image_path = item["images"][0]["path"] if item["images"] else None
|
image_path = item["images"][0]["path"] if item["images"] else None
|
||||||
location = re.sub(r"\s", " ", item["location"])
|
location = re.sub(r"\s", " ", item["location"])
|
||||||
|
tags = item.get("tags") or []
|
||||||
|
|
||||||
event = Event(
|
event = Event(
|
||||||
title=item["title"],
|
title=item["title"],
|
||||||
|
@ -26,6 +27,7 @@ class DataCollectorPipeline:
|
||||||
location=location,
|
location=location,
|
||||||
published=False,
|
published=False,
|
||||||
image_path=image_path,
|
image_path=image_path,
|
||||||
|
tags=",".join(tags),
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
event.save()
|
event.save()
|
||||||
|
|
94
data_collector/data_collector/spiders/castro_mendes.py
Normal file
94
data_collector/data_collector/spiders/castro_mendes.py
Normal file
|
@ -0,0 +1,94 @@
|
||||||
|
import copy
|
||||||
|
import re
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from urllib.parse import urlencode, urljoin
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
from slugify import slugify
|
||||||
|
|
||||||
|
# NOTE(review): hard-coded API key checked into source. It looks like the site's
# public client-side key (sent as a plain `apikey` query parameter), so this is
# presumably low-risk — but confirm, and consider loading it from settings/env
# so it can be rotated without a code change.
API_KEY = "live_2d2837a14f7043dab07f23022cb4fa3c1034caf1d92c4b198c86402146924c08"
|
class CastroMendesSpider(scrapy.Spider):
    """Scrape events from Teatro Castro Mendes (Campinas) via its JSON API.

    Crawl flow (three chained requests, carrying `event_data` forward via
    ``cb_kwargs``):

    1. ``parse``               -- home/card endpoint: one card per event.
    2. ``parse_event``         -- event/get endpoint: adds the description.
    3. ``parse_presentations`` -- event/presentation endpoint: yields one
       item per individual session with start/end datetimes.
    """

    name = "castro_mendes"
    start_urls = [
        f"https://api.teatrocastromendes.com.br/v1/home/card?apikey={API_KEY}"
    ]

    def parse(self, response):
        """Yield a details request for every event card on the home listing."""
        for event in response.json():
            event_url = urljoin("https://teatrocastromendes.com.br", event["uri"])
            event_data = {
                "title": event["ds_evento"],
                "url": event_url,
                "location": event["ds_nome_teatro"],
                "image_urls": [
                    event["img"],
                ],
                "tags": [
                    "campinas",
                    "castro-mendes",
                    slugify(event["genreName"]),
                ],
            }

            encoded_params = urlencode(
                {
                    # The details endpoint keys events by the URI with the
                    # "/evento/" prefix stripped.
                    "key": event["uri"].replace("/evento/", ""),
                    "apikey": API_KEY,
                }
            )
            event_details_url = "https://api.teatrocastromendes.com.br/v1/event/get"
            url = f"{event_details_url}?{encoded_params}"

            yield scrapy.Request(
                url,
                callback=self.parse_event,
                cb_kwargs={"event_data": event_data},
            )

    def parse_event(self, response, event_data):
        """Attach the event description, then request its presentation list."""
        data = response.json()
        event_data["description"] = data["description"]

        encoded_params = urlencode(
            {
                "id_base": data["id_base"],
                "codPeca": data["CodPeca"],
                "apikey": API_KEY,
            }
        )
        event_presentations_url = (
            "https://api.teatrocastromendes.com.br/v1/event/presentation"
        )
        url = f"{event_presentations_url}?{encoded_params}"

        yield scrapy.Request(
            url,
            callback=self.parse_presentations,
            cb_kwargs={"event_data": event_data},
        )

    def parse_presentations(self, response, event_data):
        """Yield one item per session, with start/end datetimes resolved.

        Each session gets a deep copy of ``event_data`` so sessions do not
        share (and mutate) the same nested lists.
        """
        for presentation in response.json():
            presentation_data = copy.deepcopy(event_data)

            hour = presentation["HorSessao"]
            day = presentation["day"]
            year = presentation["year"]

            # `day` is "%d/%m" and `year` is "%Y"; combine with the session hour.
            start_datetime = datetime.strptime(f"{day}/{year} {hour}", "%d/%m/%Y %H:%M")
            presentation_data["start_datetime"] = start_datetime

            duration = 60  # If not able to find event duration in description, use 60min as default
            duration_match = re.findall(
                r"Duração do espetáculo: (\d+) minutos", event_data["description"]
            )
            if duration_match:
                duration = int(duration_match.pop())
            # BUG FIX: the original computed end_datetime but then stored
            # start_datetime under "end_datetime", so every item had a
            # zero-length duration. Also reuse the already-parsed start
            # instead of re-parsing the same timestamp string.
            presentation_data["end_datetime"] = start_datetime + timedelta(
                minutes=duration
            )

            yield presentation_data
|
30
poetry.lock
generated
30
poetry.lock
generated
|
@ -1445,6 +1445,23 @@ files = [
|
||||||
{file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"},
|
{file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "python-slugify"
|
||||||
|
version = "8.0.4"
|
||||||
|
description = "A Python slugify application that also handles Unicode"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
files = [
|
||||||
|
{file = "python-slugify-8.0.4.tar.gz", hash = "sha256:59202371d1d05b54a9e7720c5e038f928f45daaffe41dd10822f3907b937c856"},
|
||||||
|
{file = "python_slugify-8.0.4-py2.py3-none-any.whl", hash = "sha256:276540b79961052b66b7d116620b36518847f52d5fd9e3a70164fc8c50faa6b8"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
text-unidecode = ">=1.3"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
unidecode = ["Unidecode (>=1.1.1)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pyyaml"
|
name = "pyyaml"
|
||||||
version = "6.0.2"
|
version = "6.0.2"
|
||||||
|
@ -1691,6 +1708,17 @@ pure-eval = "*"
|
||||||
[package.extras]
|
[package.extras]
|
||||||
tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"]
|
tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "text-unidecode"
|
||||||
|
version = "1.3"
|
||||||
|
description = "The most basic Text::Unidecode port"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
files = [
|
||||||
|
{file = "text-unidecode-1.3.tar.gz", hash = "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"},
|
||||||
|
{file = "text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tldextract"
|
name = "tldextract"
|
||||||
version = "5.1.3"
|
version = "5.1.3"
|
||||||
|
@ -1889,4 +1917,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.13"
|
python-versions = "^3.13"
|
||||||
content-hash = "4ea6ce1ed26377f323e9ce7c945ad7833485e6babeb505a6944833b830eb7375"
|
content-hash = "22eb2ecfd0bdc6014c7ee3a73b62e2533b709251e442b4ae1053310afcd0cedd"
|
||||||
|
|
|
@ -15,6 +15,7 @@ httpx = "^0.28.0"
|
||||||
curlify2 = "^2.0.0"
|
curlify2 = "^2.0.0"
|
||||||
requests = "^2.32.3"
|
requests = "^2.32.3"
|
||||||
pillow = "^11.0.0"
|
pillow = "^11.0.0"
|
||||||
|
python-slugify = "^8.0.4"
|
||||||
|
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue