From 2170046d2c0c80e0e6c821dc55f5d3e801e74e60 Mon Sep 17 00:00:00 2001
From: Paul Feitzinger
Date: Fri, 24 Jan 2025 15:24:47 -0500
Subject: [PATCH] scrape all markdown from hedgedoc wiki

run via

$ scrapy crawl pages
---
 hedgedoc_exporter/pipelines.py     |  8 ++++++++
 hedgedoc_exporter/settings.py      |  1 +
 hedgedoc_exporter/spiders/pages.py | 12 +++++++++++-
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/hedgedoc_exporter/pipelines.py b/hedgedoc_exporter/pipelines.py
index e00ff10..2c405b7 100644
--- a/hedgedoc_exporter/pipelines.py
+++ b/hedgedoc_exporter/pipelines.py
@@ -4,10 +4,18 @@
 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 
 
+import re
+
 # useful for handling different item types with a single interface
 from itemadapter import ItemAdapter
+from scrapy.pipelines.files import FilesPipeline
 
 
 class HedgedocExporterPipeline:
     def process_item(self, item, spider):
         return item
+
+class PageFilesPipeline(FilesPipeline):
+    def file_path(self, request, response=None, info=None, *, item=None):
+        name = re.search(r'/(.*?)/download$', request.url).groups()[0]
+        return f'{name}.md'
diff --git a/hedgedoc_exporter/settings.py b/hedgedoc_exporter/settings.py
index 1cf0dd0..f3b2ee1 100644
--- a/hedgedoc_exporter/settings.py
+++ b/hedgedoc_exporter/settings.py
@@ -11,6 +11,7 @@ BOT_NAME = "hedgedoc_exporter"
 
 SPIDER_MODULES = ["hedgedoc_exporter.spiders"]
 NEWSPIDER_MODULE = "hedgedoc_exporter.spiders"
+ITEM_PIPELINES = {"hedgedoc_exporter.pipelines.PageFilesPipeline": 1}
 
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
diff --git a/hedgedoc_exporter/spiders/pages.py b/hedgedoc_exporter/spiders/pages.py
index 7e7f20d..165c4fb 100644
--- a/hedgedoc_exporter/spiders/pages.py
+++ b/hedgedoc_exporter/spiders/pages.py
@@ -1,3 +1,5 @@
+import re
+
 import scrapy
 
 
@@ -6,5 +8,13 @@ class PagesSpider(scrapy.Spider):
     allowed_domains = ["basement.woodbine.nyc"]
     start_urls = ["https://basement.woodbine.nyc/s/Main_Page"]
 
+    def get_all_md_links(self, page_text):
+        return re.findall(r'\[(.*?)\]\((.*?)\)', page_text)
+
     def parse(self, response):
-        pass
+        yield {'file_urls': [response.url]}
+        for title, path in self.get_all_md_links(response.text):
+            if not path.startswith('/'):
+                continue
+            next_page = response.urljoin(path) + '/download'
+            yield scrapy.Request(next_page, callback=self.parse)
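
A quick sketch of the file-naming rule used by PageFilesPipeline above, pulled out so it can be tried on a sample URL without running the crawl. The helper name markdown_file_path and the example URL are illustrative only, not part of the project; the regex is the one from file_path. Note that Scrapy's FilesPipeline also needs a FILES_STORE setting pointing at an output directory before it will run, which is presumably configured outside this diff.

    import re

    def markdown_file_path(url):
        # Same regex as PageFilesPipeline.file_path: capture everything between
        # the leftmost "/" that can anchor a match and the trailing "/download",
        # then append ".md".
        name = re.search(r'/(.*?)/download$', url).groups()[0]
        return f'{name}.md'

    # <host>/s/<Page_Name>/download is the raw-markdown endpoint the spider requests.
    print(markdown_file_path('https://basement.woodbine.nyc/s/Main_Page/download'))
    # prints: /basement.woodbine.nyc/s/Main_Page.md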