collect data and create command to publish to Gancio

commit 6a95db8908
parent d7f6b31b53

8 changed files with 515 additions and 12 deletions
data_collector/data_collector/commands/__init__.py    0 additions (new file)
data_collector/data_collector/commands/publish.py    20 additions (new file)
data_collector/data_collector/commands/publish.py

@@ -0,0 +1,20 @@
+from scrapy.commands import ScrapyCommand
+
+from data_collector.models import Event, initialize_db
+
+
+class Command(ScrapyCommand):
+    requires_project = True
+    default_settings = {"LOG_ENABLED": False}
+
+    def short_desc(self) -> str:
+        return "Publish events to Gancio"
+
+    def run(self, args, opts):
+        db = initialize_db()
+
+        unpublished_events = Event.select().where(Event.published == False)
+        for event in unpublished_events:
+            event.publish()
+
+        db.close()
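With the COMMANDS_MODULE = "data_collector.commands" setting added further down, Scrapy discovers this module as a custom subcommand, so the whole flow runs as `scrapy publish` from the project root. The `== False` comparison is deliberate: peewee overloads the operator to build a SQL expression, so the usual lint fix (`is False`) would break the query. A minimal sketch of equivalent filters, assuming the Event model from this commit:

from data_collector.models import Event

# Both build a WHERE clause; "is False" would compare Python objects instead.
unpublished = Event.select().where(Event.published == False)  # noqa: E712
unpublished = Event.select().where(~Event.published)          # NOT published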
data_collector/data_collector/models.py

@@ -1,3 +1,4 @@
+import httpx
 import peewee
 
 from data_collector import settings
@@ -9,15 +10,47 @@ def get_db():
 class Event(peewee.Model):
     title = peewee.CharField()
     start_datetime = peewee.DateTimeField()
     end_datetime = peewee.DateTimeField()
+    start_timestamp = peewee.IntegerField()
+    end_timestamp = peewee.IntegerField()
     description = peewee.TextField()
     url = peewee.CharField()
     location = peewee.CharField()
-    processed = peewee.BooleanField()
     published = peewee.BooleanField()
     image_path = peewee.CharField()
 
     class Meta:
         database = get_db()
-        indexes = ((("title", "start_datetime", "end_datetime", "url"), True),)
+        indexes = ((("title", "start_timestamp", "end_timestamp", "url"), True),)
 
+    def publish(self):
+        payload = {
+            "title": self.title,
+            "start_datetime": self.start_timestamp,
+            "end_datetime": self.end_timestamp,
+            "description": self.description,
+        }
+
+        place_id_by_location = {
+            "Teatro Polytheama": (1, "polytheama"),
+            "Sala Glória Rocha do Centro das Artes": (3, "gloria-rocha"),
+        }
+        place_id, tag = place_id_by_location.get(self.location) or (None, None)
+        if place_id:
+            payload["place_id"] = place_id
+            payload["tags"] = ["jundiaí", tag]
+
+        with open(f"{settings.IMAGES_STORE}{self.image_path}", "rb") as image_file:
+            files = {"image": image_file}
+            response = httpx.post(settings.GANCIO_API_URL, data=payload, files=files)
+
+        if response.status_code == 200:
+            self.published = True
+            self.save()
+            print(f"{self.title} published")
+        else:
+            print(
+                f"Fail to publish {self.title} ({response.status_code} {response.text})"
+            )
 
 
 def initialize_db():
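The new integer fields exist because of what the payload carries: the Gancio event endpoint, as this payload assumes, takes start_datetime/end_datetime as Unix epoch seconds (multipart form data, with the poster under "image"), while peewee's DateTimeField round-trips through SQLite as a formatted string. One caveat, since the spider parses naive datetimes: datetime.timestamp() interprets a naive value in the machine's local timezone. A small sketch; pinning UTC-3 (America/Sao_Paulo, matching the Jundiaí venues) is an assumption:

from datetime import datetime, timedelta, timezone

start = datetime.strptime("25/12/2024 20:00", "%d/%m/%Y %H:%M")  # naive
sao_paulo = timezone(timedelta(hours=-3))  # assumed fixed offset, no DST handling
print(int(start.replace(tzinfo=sao_paulo).timestamp()))  # 1735167600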
data_collector/data_collector/pipelines.py

@@ -1,28 +1,37 @@
+import re
+
 import httpx
 import peewee
 from itemadapter import ItemAdapter
+from scrapy.exceptions import DropItem
 
 from data_collector import settings
 from data_collector.models import Event, initialize_db
 
 
 class DataCollectorPipeline:
 
     def open_spider(self, spider):
         self.db = initialize_db()
 
     def process_item(self, item, spider):
+        image_path = item["images"][0]["path"] if item["images"] else None
+        location = re.sub(r"\s", " ", item["location"])
+
         event = Event(
             title=item["title"],
             start_datetime=item["start_datetime"],
             end_datetime=item["end_datetime"],
+            start_timestamp=item["start_datetime"].timestamp(),
+            end_timestamp=item["end_datetime"].timestamp(),
             description=item["description"],
             url=item["url"],
-            location=item["location"],
-            processed=False,
+            location=location,
             published=False,
+            image_path=image_path,
         )
         try:
             event.save()
         except peewee.IntegrityError:
             # Event was added before
-            pass
+            raise DropItem("Already scrapped before")
 
         return item
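Deduplication rides on the model's composite unique index: re-scraping a known event makes event.save() raise peewee.IntegrityError, and raising DropItem instead of the old silent pass surfaces the duplicate in Scrapy's stats and stops any later pipeline stages. A minimal, self-contained sketch of the same pattern with a throwaway model:

import peewee

db = peewee.SqliteDatabase(":memory:")

class Item(peewee.Model):
    url = peewee.CharField(unique=True)  # stands in for the composite index

    class Meta:
        database = db

db.create_tables([Item])
Item.create(url="https://example.com/a")
try:
    Item.create(url="https://example.com/a")  # second insert violates the index
except peewee.IntegrityError:
    print("duplicate, dropping")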
data_collector/data_collector/settings.py

@@ -63,9 +63,14 @@ ROBOTSTXT_OBEY = False
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
+    "scrapy.pipelines.images.ImagesPipeline": 100,
     "data_collector.pipelines.DataCollectorPipeline": 300,
 }
 
+IMAGES_STORE = "images/"
+
+COMMANDS_MODULE = "data_collector.commands"
+
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
 # AUTOTHROTTLE_ENABLED = True

@@ -92,3 +97,5 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 FEED_EXPORT_ENCODING = "utf-8"
 
 DATABASE = "agenda.db"
+
+GANCIO_API_URL = "https://agenda.rocha.dev.br/api/event"
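Pipeline priorities run lowest first, so ImagesPipeline (100) downloads everything listed in image_urls and attaches its results to item["images"] before DataCollectorPipeline (300) reads the stored path; note that Scrapy's ImagesPipeline also requires Pillow. Roughly what an item looks like by the time the custom pipeline sees it (values illustrative):

item = {
    "image_urls": ["https://example.com/poster.jpg"],
    "images": [
        {
            "url": "https://example.com/poster.jpg",
            "path": "full/0a1b2c3d4e5f.jpg",  # relative to IMAGES_STORE
            "checksum": "d41d8cd98f00b204e9800998ecf8427e",
            "status": "downloaded",
        }
    ],
    # ...plus the fields the spider set: title, start_datetime, etc.
}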
@@ -24,13 +24,33 @@ class CulturaJundiaiSpider(scrapy.Spider):
             end_time = date_search.groupdict()["end_time"]
             end_datetime = datetime.strptime(f"{date} {end_time}", "%d/%m/%Y %H:%M")
 
-            yield {
+            event_data = {
                 "title": event.css(".titulo-lista::text").get(),
                 "start_datetime": start_datetime,
                 "end_datetime": end_datetime,
                 "url": event.css("a::attr(href)").get(),
                 "location": event.css(".resumo-lista::text").get(),
-                "tags": [
-                    "jundiaí",
-                ],
             }
+            yield scrapy.Request(
+                event_data["url"],
+                callback=self.parse_event,
+                cb_kwargs={"event_data": event_data},
+            )
 
         for page_url in response.css("#paginacao a::attr(href)").getall():
             yield scrapy.Request(page_url)
 
+    def parse_event(self, response, event_data):
+        image = response.xpath(
+            '//meta[@property="og:image" and @content != ""]/@content'
+        ).get()
+        event_data["image_urls"] = [
+            image,
+        ]
+
+        event_data["description"] = response.css("article").get()
+
+        yield event_data
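cb_kwargs threads the partially built event_data dict into parse_event, which fills in image_urls and description before yielding the finished item to the pipelines (the hard-coded "jundiaí" tag moves into Event.publish()). One hedged nit, using the same field names as above: when a page has no usable og:image, the XPath returns None and image_urls becomes [None], which ImagesPipeline cannot turn into a download request; a one-line guard avoids that:

# Keep image_urls empty instead of [None] when og:image is missing:
event_data["image_urls"] = [image] if image else []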