Scrape all markdown from the HedgeDoc wiki.

Run via:
$ scrapy crawl pages
main · Paul Feitzinger · 3 months ago · commit 2170046d2c (parent 202dfc2902)
hedgedoc_exporter/pipelines.py

@@ -4,10 +4,18 @@
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import re

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.files import FilesPipeline


class HedgedocExporterPipeline:
    def process_item(self, item, spider):
        return item


class PageFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # Download URLs look like https://<host>/<page>/download; keep just <page>
        name = re.search(r'/([^/]+)/download$', request.url).group(1)
        return f'{name}.md'
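For reference, the pipeline turns each note's download URL into a flat .md filename. A minimal sketch of that mapping (the URL below is illustrative, not taken from the wiki):

import re

url = 'https://basement.woodbine.nyc/Main_Page/download'  # illustrative note URL
name = re.search(r'/([^/]+)/download$', url).group(1)
print(f'{name}.md')  # -> Main_Page.md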

hedgedoc_exporter/settings.py

@@ -11,6 +11,7 @@ BOT_NAME = "hedgedoc_exporter"

SPIDER_MODULES = ["hedgedoc_exporter.spiders"]
NEWSPIDER_MODULE = "hedgedoc_exporter.spiders"

ITEM_PIPELINES = {"hedgedoc_exporter.pipelines.PageFilesPipeline": 1}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
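Scrapy's FilesPipeline stays disabled unless FILES_STORE is set; assuming it is not configured elsewhere in settings.py (this hunk does not show it), something like the following is also needed:

# Directory the downloaded .md files are written to (the path is an example, not from the diff)
FILES_STORE = "exported_pages"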

hedgedoc_exporter/spiders/pages.py

@@ -1,3 +1,5 @@
import re

import scrapy

@@ -6,5 +8,13 @@ class PagesSpider(scrapy.Spider):
    allowed_domains = ["basement.woodbine.nyc"]
    start_urls = ["https://basement.woodbine.nyc/s/Main_Page"]

    def get_all_md_links(self, page_text):
        # Return (title, target) pairs for every [title](target) markdown link
        return re.findall(r'\[(.*?)\]\((.*?)\)', page_text)

    def parse(self, response):
        # Hand the current page to the files pipeline
        yield {'file_urls': [response.url]}
        # Follow only site-relative links, fetching each linked page's raw markdown
        for title, path in self.get_all_md_links(response.text):
            if not path.startswith('/'):
                continue
            next_page = response.urljoin(path) + '/download'
            yield scrapy.Request(next_page, callback=self.parse)
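A quick sanity check of the link regex (the sample text is made up, not from the wiki):

import re

sample = "See [Calendar](/Calendar) and [docs](https://example.org/guide)."
print(re.findall(r'\[(.*?)\]\((.*?)\)', sample))
# -> [('Calendar', '/Calendar'), ('docs', 'https://example.org/guide')]
# parse() would follow only '/Calendar', since the external link does not start with '/'.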
