From 2170046d2c0c80e0e6c821dc55f5d3e801e74e60 Mon Sep 17 00:00:00 2001
From: Paul Feitzinger
Date: Fri, 24 Jan 2025 15:24:47 -0500
Subject: [PATCH] scrape all markdown from hedgedoc wiki

run via

$ scrapy crawl pages
---
 hedgedoc_exporter/pipelines.py     |  8 ++++++++
 hedgedoc_exporter/settings.py      |  1 +
 hedgedoc_exporter/spiders/pages.py | 12 +++++++++++-
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/hedgedoc_exporter/pipelines.py b/hedgedoc_exporter/pipelines.py
index e00ff10..2c405b7 100644
--- a/hedgedoc_exporter/pipelines.py
+++ b/hedgedoc_exporter/pipelines.py
@@ -4,10 +4,18 @@
 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 
 
+import re
+
 # useful for handling different item types with a single interface
 from itemadapter import ItemAdapter
+from scrapy.pipelines.files import FilesPipeline
 
 
 class HedgedocExporterPipeline:
     def process_item(self, item, spider):
         return item
+
+class PageFilesPipeline(FilesPipeline):
+    def file_path(self, request, response=None, info=None, *, item=None):
+        name = re.search(r'/(.*?)/download$', request.url).groups()[0]
+        return f'{name}.md'
diff --git a/hedgedoc_exporter/settings.py b/hedgedoc_exporter/settings.py
index 1cf0dd0..f3b2ee1 100644
--- a/hedgedoc_exporter/settings.py
+++ b/hedgedoc_exporter/settings.py
@@ -11,6 +11,7 @@ BOT_NAME = "hedgedoc_exporter"
 
 SPIDER_MODULES = ["hedgedoc_exporter.spiders"]
 NEWSPIDER_MODULE = "hedgedoc_exporter.spiders"
+ITEM_PIPELINES = {"hedgedoc_exporter.pipelines.PageFilesPipeline": 1}
 
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
diff --git a/hedgedoc_exporter/spiders/pages.py b/hedgedoc_exporter/spiders/pages.py
index 7e7f20d..165c4fb 100644
--- a/hedgedoc_exporter/spiders/pages.py
+++ b/hedgedoc_exporter/spiders/pages.py
@@ -1,3 +1,5 @@
+import re
+
 import scrapy
 
 
@@ -6,5 +8,13 @@ class PagesSpider(scrapy.Spider):
     allowed_domains = ["basement.woodbine.nyc"]
     start_urls = ["https://basement.woodbine.nyc/s/Main_Page"]
 
+    def get_all_md_links(self, page_text):
+        return re.findall(r'\[(.*?)\]\((.*?)\)', page_text)
+
     def parse(self, response):
-        pass
+        yield {'file_urls': [response.url]}
+        for title, path in self.get_all_md_links(response.text):
+            if not path.startswith('/'):
+                continue
+            next_page = response.urljoin(path) + '/download'
+            yield scrapy.Request(next_page, callback=self.parse)
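
A quick sketch of the file-naming rule used by PageFilesPipeline above, pulled out so it can be tried on a sample URL without running the crawl. The helper name markdown_file_path and the example URL are illustrative only, not part of the project; the regex is the one from file_path. Note that Scrapy's FilesPipeline also needs a FILES_STORE setting pointing at an output directory before it will run, which is presumably configured outside this diff.

    import re

    def markdown_file_path(url):
        # Same regex as PageFilesPipeline.file_path: capture everything between
        # the leftmost "/" that can anchor a match and the trailing "/download",
        # then append ".md".
        name = re.search(r'/(.*?)/download$', url).groups()[0]
        return f'{name}.md'

    # <host>/s/<Page_Name>/download is the raw-markdown endpoint the spider requests.
    print(markdown_file_path('https://basement.woodbine.nyc/s/Main_Page/download'))
    # prints: /basement.woodbine.nyc/s/Main_Page.md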