collect data and create command to publish to Gancio

Renne Rocha 2024-12-13 20:39:25 -03:00
parent d7f6b31b53
commit 6a95db8908
8 changed files with 515 additions and 12 deletions

View file

@@ -0,0 +1,20 @@
+from scrapy.commands import ScrapyCommand
+
+from data_collector.models import Event, initialize_db
+
+
+class Command(ScrapyCommand):
+    requires_project = True
+    default_settings = {"LOG_ENABLED": False}
+
+    def short_desc(self) -> str:
+        return "Publish events to Gancio"
+
+    def run(self, args, opts):
+        db = initialize_db()
+        unpublished_events = Event.select().where(Event.published == False)
+        for event in unpublished_events:
+            event.publish()
+        db.close()
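Scrapy exposes each module in the package named by COMMANDS_MODULE (set in settings.py further down) as a subcommand whose name is the module's filename, which this diff does not show. A usage sketch, assuming the new module is data_collector/commands/publish.py:

# Hypothetical invocation; the subcommand name comes from the module's
# filename, which is not visible in this diff:
#
#   $ scrapy publish
#
# default_settings forces LOG_ENABLED = False, so the run prints only
# the "<title> published" / "Failed to publish <title>" lines emitted
# by Event.publish().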

View file

@@ -1,3 +1,4 @@
+import httpx
 import peewee
 
 from data_collector import settings
@@ -9,15 +10,47 @@ def get_db():
 
 class Event(peewee.Model):
     title = peewee.CharField()
-    start_datetime = peewee.DateTimeField()
-    end_datetime = peewee.DateTimeField()
+    start_timestamp = peewee.IntegerField()
+    end_timestamp = peewee.IntegerField()
     description = peewee.TextField()
     url = peewee.CharField()
     location = peewee.CharField()
-    processed = peewee.BooleanField()
+    published = peewee.BooleanField()
+    image_path = peewee.CharField()
 
     class Meta:
         database = get_db()
-        indexes = ((("title", "start_datetime", "end_datetime", "url"), True),)
+        indexes = ((("title", "start_timestamp", "end_timestamp", "url"), True),)
 
+    def publish(self):
+        payload = {
+            "title": self.title,
+            "start_datetime": self.start_timestamp,
+            "end_datetime": self.end_timestamp,
+            "description": self.description,
+        }
+        place_id_by_location = {
+            "Teatro Polytheama": (1, "polytheama"),
+            "Sala Glória Rocha do Centro das Artes": (3, "gloria-rocha"),
+        }
+        place_id, tag = place_id_by_location.get(self.location) or (None, None)
+        if place_id:
+            payload["place_id"] = place_id
+            payload["tags"] = ["jundiaí", tag]
+        with open(f"{settings.IMAGES_STORE}{self.image_path}", "rb") as image_file:
+            files = {"image": image_file}
+            response = httpx.post(settings.GANCIO_API_URL, data=payload, files=files)
+        if response.status_code == 200:
+            self.published = True
+            self.save()
+            print(f"{self.title} published")
+        else:
+            print(
+                f"Failed to publish {self.title} ({response.status_code} {response.text})"
+            )
 
 
 def initialize_db():
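Event.publish() sends the event as a multipart upload: the scalar fields travel in httpx's data= argument and the downloaded poster in files=, which is how httpx encodes a mixed form-plus-file POST. A minimal sketch of exercising it by hand, assuming the database already holds at least one scraped, unpublished event:

from data_collector.models import Event, initialize_db

db = initialize_db()
event = Event.get(Event.published == False)  # raises Event.DoesNotExist if none
event.publish()  # POSTs payload + image to settings.GANCIO_API_URL
db.close()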

View file

@@ -1,28 +1,37 @@
+import re
+
 import httpx
 import peewee
 from itemadapter import ItemAdapter
+from scrapy.exceptions import DropItem
 
 from data_collector import settings
 from data_collector.models import Event, initialize_db
 
 
 class DataCollectorPipeline:
     def open_spider(self, spider):
         self.db = initialize_db()
 
     def process_item(self, item, spider):
+        image_path = item["images"][0]["path"] if item["images"] else None
+        location = re.sub(r"\s", " ", item["location"])
         event = Event(
             title=item["title"],
-            start_datetime=item["start_datetime"],
-            end_datetime=item["end_datetime"],
+            start_timestamp=item["start_datetime"].timestamp(),
+            end_timestamp=item["end_datetime"].timestamp(),
             description=item["description"],
             url=item["url"],
-            location=item["location"],
-            processed=False,
+            location=location,
+            published=False,
+            image_path=image_path,
         )
         try:
             event.save()
         except peewee.IntegrityError:
-            # Event was added before
-            pass
+            raise DropItem("Already scraped before")
         return item
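Deduplication rests on the composite unique index declared in Event.Meta.indexes: saving a row with an already-seen (title, start_timestamp, end_timestamp, url) combination raises peewee.IntegrityError, which the pipeline converts into DropItem so Scrapy records the item as dropped rather than storing a duplicate. A minimal sketch of that mechanism in isolation, using an in-memory SQLite database and a simplified model:

import peewee

db = peewee.SqliteDatabase(":memory:")

class Item(peewee.Model):
    title = peewee.CharField()
    url = peewee.CharField()

    class Meta:
        database = db
        indexes = ((("title", "url"), True),)  # True marks the index unique

db.create_tables([Item])
Item.create(title="Show", url="https://example.org/show")
try:
    Item.create(title="Show", url="https://example.org/show")  # same key
except peewee.IntegrityError:
    print("duplicate rejected by the unique index")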

View file

@@ -63,9 +63,14 @@ ROBOTSTXT_OBEY = False
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
+    "scrapy.pipelines.images.ImagesPipeline": 100,
     "data_collector.pipelines.DataCollectorPipeline": 300,
 }
+IMAGES_STORE = "images/"
+
+COMMANDS_MODULE = "data_collector.commands"
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
 # AUTOTHROTTLE_ENABLED = True
@@ -92,3 +97,5 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 FEED_EXPORT_ENCODING = "utf-8"
 
 DATABASE = "agenda.db"
+
+GANCIO_API_URL = "https://agenda.rocha.dev.br/api/event"
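ImagesPipeline runs at priority 100, ahead of DataCollectorPipeline at 300: it downloads every URL in the item's image_urls field into IMAGES_STORE (it requires Pillow) and records the results under images, which is where process_item later reads the stored path. A sketch of the item shape before and after that step; all values are illustrative, and the exact result keys are those of recent Scrapy versions:

# Item as yielded by the spider:
item_in = {
    "title": "...",
    "image_urls": ["https://example.org/poster.jpg"],  # hypothetical URL
}
# Item after ImagesPipeline, as seen by DataCollectorPipeline:
item_out = {
    "title": "...",
    "image_urls": ["https://example.org/poster.jpg"],
    "images": [
        {
            "url": "https://example.org/poster.jpg",
            "path": "full/0a79f1.jpg",  # hypothetical SHA1-based filename
            "checksum": "...",
            "status": "downloaded",
        }
    ],
}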

View file

@@ -24,13 +24,33 @@ class CulturaJundiaiSpider(scrapy.Spider):
             end_time = date_search.groupdict()["end_time"]
             end_datetime = datetime.strptime(f"{date} {end_time}", "%d/%m/%Y %H:%M")
 
-            yield {
+            event_data = {
                 "title": event.css(".titulo-lista::text").get(),
                 "start_datetime": start_datetime,
                 "end_datetime": end_datetime,
                 "url": event.css("a::attr(href)").get(),
                 "location": event.css(".resumo-lista::text").get(),
+                "tags": [
+                    "jundiaí",
+                ],
             }
+            yield scrapy.Request(
+                event_data["url"],
+                callback=self.parse_event,
+                cb_kwargs={"event_data": event_data},
+            )
 
         for page_url in response.css("#paginacao a::attr(href)").getall():
             yield scrapy.Request(page_url)
+
+    def parse_event(self, response, event_data):
+        image = response.xpath(
+            '//meta[@property="og:image" and @content != ""]/@content'
+        ).get()
+        event_data["image_urls"] = [
+            image,
+        ]
+        event_data["description"] = response.css("article").get()
+        yield event_data
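Instead of yielding the item from the listing page, the spider now forwards the partially-filled event_data to parse_event through cb_kwargs, which Scrapy delivers to the callback as a keyword argument; the item only enters the pipelines once the detail page has contributed the image URL and description. A minimal, self-contained sketch of that hand-off pattern, with placeholder names and URLs that are not part of the real spider:

import scrapy

class ChainedSpider(scrapy.Spider):
    name = "chained"
    start_urls = ["https://example.org/listing"]

    def parse(self, response):
        partial = {"title": "from the listing page"}
        yield scrapy.Request(
            "https://example.org/detail",
            callback=self.parse_detail,
            cb_kwargs={"partial": partial},  # delivered as a kwarg below
        )

    def parse_detail(self, response, partial):
        partial["description"] = "from the detail page"
        yield partial  # only now does the item reach the pipelines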