import re

import scrapy


class PagesSpider(scrapy.Spider):
    """Crawl the wiki from the main page, downloading every page it links to."""

    name = "pages"
    allowed_domains = ["basement.woodbine.nyc"]
    start_urls = ["https://basement.woodbine.nyc/s/Main_Page"]

    def get_all_md_links(self, page_text):
        # Extract (title, target) pairs from Markdown links: [title](target)
        return re.findall(r'\[(.*?)\]\((.*?)\)', page_text)

    def parse(self, response):
        # Hand the current page to the files pipeline for download.
        yield {'file_urls': [response.url]}
        for title, path in self.get_all_md_links(response.text):
            # Follow only site-relative links; skip external URLs and anchors.
            if not path.startswith('/'):
                continue
            # '/download' fetches the raw page source rather than the rendered HTML.
            next_page = response.urljoin(path) + '/download'
            yield scrapy.Request(next_page, callback=self.parse)
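
# --- Usage sketch (assumptions, not part of the original spider) ---
# The {'file_urls': [...]} items above are only saved to disk if Scrapy's
# built-in FilesPipeline is enabled. A minimal sketch: the "downloads"
# directory and the "pages_spider.py" filename below are illustrative
# names, and these settings could equally live in a project's settings.py.
#
# Settings to add as a class attribute on PagesSpider:
#
#     custom_settings = {
#         "ITEM_PIPELINES": {"scrapy.pipelines.files.FilesPipeline": 1},
#         "FILES_STORE": "downloads",
#     }
#
# The spider can then be run standalone, without a full Scrapy project:
#
#     $ scrapy runspider pages_spider.py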