Scrape all markdown from the HedgeDoc wiki.

Run via:
$ scrapy crawl pages
main · Paul Feitzinger · 3 months ago · commit 2170046d2c (parent 202dfc2902)
hedgedoc_exporter/pipelines.py

@@ -4,10 +4,18 @@
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import re

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.files import FilesPipeline


class HedgedocExporterPipeline:
    def process_item(self, item, spider):
        return item


class PageFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # Download URLs look like https://<host>/<page>/download; keep just <page>
        name = re.search(r'/([^/]+)/download$', request.url).group(1)
        return f'{name}.md'
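For reference, the pipeline turns each note's download URL into a flat .md filename. A minimal sketch of that mapping (the URL below is illustrative, not taken from the wiki):

import re

url = 'https://basement.woodbine.nyc/Main_Page/download'  # illustrative note URL
name = re.search(r'/([^/]+)/download$', url).group(1)
print(f'{name}.md')  # -> Main_Page.md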

hedgedoc_exporter/settings.py

@@ -11,6 +11,7 @@ BOT_NAME = "hedgedoc_exporter"

SPIDER_MODULES = ["hedgedoc_exporter.spiders"]
NEWSPIDER_MODULE = "hedgedoc_exporter.spiders"

ITEM_PIPELINES = {"hedgedoc_exporter.pipelines.PageFilesPipeline": 1}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
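Scrapy's FilesPipeline stays disabled unless FILES_STORE is set; assuming it is not configured elsewhere in settings.py (this hunk does not show it), something like the following is also needed:

# Directory the downloaded .md files are written to (the path is an example, not from the diff)
FILES_STORE = "exported_pages"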

hedgedoc_exporter/spiders/pages.py

@@ -1,3 +1,5 @@
import re

import scrapy

@@ -6,5 +8,13 @@ class PagesSpider(scrapy.Spider):
    allowed_domains = ["basement.woodbine.nyc"]
    start_urls = ["https://basement.woodbine.nyc/s/Main_Page"]

    def get_all_md_links(self, page_text):
        # Return (title, target) pairs for every [title](target) markdown link
        return re.findall(r'\[(.*?)\]\((.*?)\)', page_text)

    def parse(self, response):
        # Hand the current page to the files pipeline
        yield {'file_urls': [response.url]}
        # Follow only site-relative links, fetching each linked page's raw markdown
        for title, path in self.get_all_md_links(response.text):
            if not path.startswith('/'):
                continue
            next_page = response.urljoin(path) + '/download'
            yield scrapy.Request(next_page, callback=self.parse)
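A quick sanity check of the link regex (the sample text is made up, not from the wiki):

import re

sample = "See [Calendar](/Calendar) and [docs](https://example.org/guide)."
print(re.findall(r'\[(.*?)\]\((.*?)\)', sample))
# -> [('Calendar', '/Calendar'), ('docs', 'https://example.org/guide')]
# parse() would follow only '/Calendar', since the external link does not start with '/'.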
