collect data and create command to publish to Gancio

Renne Rocha 2024-12-13 20:39:25 -03:00
parent d7f6b31b53
commit 6a95db8908
8 changed files with 515 additions and 12 deletions

View file

@@ -0,0 +1,20 @@
+from scrapy.commands import ScrapyCommand
+
+from data_collector.models import Event, initialize_db
+
+
+class Command(ScrapyCommand):
+    requires_project = True
+    default_settings = {"LOG_ENABLED": False}
+
+    def short_desc(self) -> str:
+        return "Publish events to Gancio"
+
+    def run(self, args, opts):
+        db = initialize_db()
+        unpublished_events = Event.select().where(Event.published == False)
+        for event in unpublished_events:
+            event.publish()
+        db.close()
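Scrapy exposes each module in the package named by COMMANDS_MODULE (set in settings.py further down) as a subcommand whose name is the module's filename, which this diff does not show. A usage sketch, assuming the new module is data_collector/commands/publish.py:

# Hypothetical invocation; the subcommand name comes from the module's
# filename, which is not visible in this diff:
#
#   $ scrapy publish
#
# default_settings forces LOG_ENABLED = False, so the run prints only
# the "<title> published" / "Failed to publish <title>" lines emitted
# by Event.publish().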

View file

@@ -1,3 +1,4 @@
+import httpx
 import peewee
 
 from data_collector import settings
@@ -9,15 +10,47 @@ def get_db():
 
 class Event(peewee.Model):
     title = peewee.CharField()
-    start_datetime = peewee.DateTimeField()
-    end_datetime = peewee.DateTimeField()
+    start_timestamp = peewee.IntegerField()
+    end_timestamp = peewee.IntegerField()
     description = peewee.TextField()
     url = peewee.CharField()
     location = peewee.CharField()
-    processed = peewee.BooleanField()
+    published = peewee.BooleanField()
+    image_path = peewee.CharField()
 
     class Meta:
         database = get_db()
-        indexes = ((("title", "start_datetime", "end_datetime", "url"), True),)
+        indexes = ((("title", "start_timestamp", "end_timestamp", "url"), True),)
 
+    def publish(self):
+        payload = {
+            "title": self.title,
+            "start_datetime": self.start_timestamp,
+            "end_datetime": self.end_timestamp,
+            "description": self.description,
+        }
+        place_id_by_location = {
+            "Teatro Polytheama": (1, "polytheama"),
+            "Sala Glória Rocha do Centro das Artes": (3, "gloria-rocha"),
+        }
+        place_id, tag = place_id_by_location.get(self.location) or (None, None)
+        if place_id:
+            payload["place_id"] = place_id
+            payload["tags"] = ["jundiaí", tag]
+        with open(f"{settings.IMAGES_STORE}{self.image_path}", "rb") as image_file:
+            files = {"image": image_file}
+            response = httpx.post(settings.GANCIO_API_URL, data=payload, files=files)
+        if response.status_code == 200:
+            self.published = True
+            self.save()
+            print(f"{self.title} published")
+        else:
+            print(
+                f"Failed to publish {self.title} ({response.status_code} {response.text})"
+            )
 
 
 def initialize_db():
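Event.publish() sends the event as a multipart upload: the scalar fields travel in httpx's data= argument and the downloaded poster in files=, which is how httpx encodes a mixed form-plus-file POST. A minimal sketch of exercising it by hand, assuming the database already holds at least one scraped, unpublished event:

from data_collector.models import Event, initialize_db

db = initialize_db()
event = Event.get(Event.published == False)  # raises Event.DoesNotExist if none
event.publish()  # POSTs payload + image to settings.GANCIO_API_URL
db.close()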

View file

@@ -1,28 +1,37 @@
+import re
+
 import httpx
 import peewee
 from itemadapter import ItemAdapter
+from scrapy.exceptions import DropItem
 
 from data_collector import settings
 from data_collector.models import Event, initialize_db
 
 
 class DataCollectorPipeline:
     def open_spider(self, spider):
         self.db = initialize_db()
 
     def process_item(self, item, spider):
+        image_path = item["images"][0]["path"] if item["images"] else None
+        location = re.sub(r"\s", " ", item["location"])
         event = Event(
             title=item["title"],
-            start_datetime=item["start_datetime"],
-            end_datetime=item["end_datetime"],
+            start_timestamp=item["start_datetime"].timestamp(),
+            end_timestamp=item["end_datetime"].timestamp(),
             description=item["description"],
             url=item["url"],
-            location=item["location"],
-            processed=False,
+            location=location,
+            published=False,
+            image_path=image_path,
         )
         try:
             event.save()
         except peewee.IntegrityError:
-            # Event was added before
-            pass
+            raise DropItem("Already scraped before")
         return item
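Deduplication rests on the composite unique index declared in Event.Meta.indexes: saving a row with an already-seen (title, start_timestamp, end_timestamp, url) combination raises peewee.IntegrityError, which the pipeline converts into DropItem so Scrapy records the item as dropped rather than storing a duplicate. A minimal sketch of that mechanism in isolation, using an in-memory SQLite database and a simplified model:

import peewee

db = peewee.SqliteDatabase(":memory:")

class Item(peewee.Model):
    title = peewee.CharField()
    url = peewee.CharField()

    class Meta:
        database = db
        indexes = ((("title", "url"), True),)  # True marks the index unique

db.create_tables([Item])
Item.create(title="Show", url="https://example.org/show")
try:
    Item.create(title="Show", url="https://example.org/show")  # same key
except peewee.IntegrityError:
    print("duplicate rejected by the unique index")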

View file

@@ -63,9 +63,14 @@ ROBOTSTXT_OBEY = False
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
+    "scrapy.pipelines.images.ImagesPipeline": 100,
     "data_collector.pipelines.DataCollectorPipeline": 300,
 }
+IMAGES_STORE = "images/"
+
+COMMANDS_MODULE = "data_collector.commands"
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
 # AUTOTHROTTLE_ENABLED = True
@@ -92,3 +97,5 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 FEED_EXPORT_ENCODING = "utf-8"
 
 DATABASE = "agenda.db"
+
+GANCIO_API_URL = "https://agenda.rocha.dev.br/api/event"
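ImagesPipeline runs at priority 100, ahead of DataCollectorPipeline at 300: it downloads every URL in the item's image_urls field into IMAGES_STORE (it requires Pillow) and records the results under images, which is where process_item later reads the stored path. A sketch of the item shape before and after that step; all values are illustrative, and the exact result keys are those of recent Scrapy versions:

# Item as yielded by the spider:
item_in = {
    "title": "...",
    "image_urls": ["https://example.org/poster.jpg"],  # hypothetical URL
}
# Item after ImagesPipeline, as seen by DataCollectorPipeline:
item_out = {
    "title": "...",
    "image_urls": ["https://example.org/poster.jpg"],
    "images": [
        {
            "url": "https://example.org/poster.jpg",
            "path": "full/0a79f1.jpg",  # hypothetical SHA1-based filename
            "checksum": "...",
            "status": "downloaded",
        }
    ],
}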

View file

@@ -24,13 +24,33 @@ class CulturaJundiaiSpider(scrapy.Spider):
             end_time = date_search.groupdict()["end_time"]
             end_datetime = datetime.strptime(f"{date} {end_time}", "%d/%m/%Y %H:%M")
 
-            yield {
+            event_data = {
                 "title": event.css(".titulo-lista::text").get(),
                 "start_datetime": start_datetime,
                 "end_datetime": end_datetime,
                 "url": event.css("a::attr(href)").get(),
                 "location": event.css(".resumo-lista::text").get(),
+                "tags": [
+                    "jundiaí",
+                ],
             }
+            yield scrapy.Request(
+                event_data["url"],
+                callback=self.parse_event,
+                cb_kwargs={"event_data": event_data},
+            )
 
         for page_url in response.css("#paginacao a::attr(href)").getall():
             yield scrapy.Request(page_url)
+
+    def parse_event(self, response, event_data):
+        image = response.xpath(
+            '//meta[@property="og:image" and @content != ""]/@content'
+        ).get()
+        event_data["image_urls"] = [
+            image,
+        ]
+        event_data["description"] = response.css("article").get()
+        yield event_data
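Instead of yielding the item from the listing page, the spider now forwards the partially-filled event_data to parse_event through cb_kwargs, which Scrapy delivers to the callback as a keyword argument; the item only enters the pipelines once the detail page has contributed the image URL and description. A minimal, self-contained sketch of that hand-off pattern, with placeholder names and URLs that are not part of the real spider:

import scrapy

class ChainedSpider(scrapy.Spider):
    name = "chained"
    start_urls = ["https://example.org/listing"]

    def parse(self, response):
        partial = {"title": "from the listing page"}
        yield scrapy.Request(
            "https://example.org/detail",
            callback=self.parse_detail,
            cb_kwargs={"partial": partial},  # delivered as a kwarg below
        )

    def parse_detail(self, response, partial):
        partial["description"] = "from the detail page"
        yield partial  # only now does the item reach the pipelines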