You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

21 lines
628 B

import re
import scrapy
class PagesSpider(scrapy.Spider):
    """Follow site-absolute Markdown links outward from the start URL.

    Every response yields one item ``{'file_urls': [response.url]}`` and
    one follow-up request per Markdown link whose target begins with
    ``/``; the follow-up URL is the link's path with ``/download``
    appended.
    """

    name = "pages"
    allowed_domains = ["basement.woodbine.nyc"]
    start_urls = ["https://basement.woodbine.nyc/s/Main_Page"]

    # Inline Markdown link ``[title](target)``; compiled once and reused
    # for every response.
    _MD_LINK_RE = re.compile(r'\[(.*?)\]\((.*?)\)')
    # First ``?`` or ``#`` in a link target, i.e. where its path part ends.
    _PATH_END_RE = re.compile(r'[?#]')

    def get_all_md_links(self, page_text):
        """Return a list of ``(title, target)`` tuples, one per Markdown
        link of the form ``[title](target)`` found in *page_text*.
        """
        return self._MD_LINK_RE.findall(page_text)

    def parse(self, response):
        """Yield an item for *response* and requests for its linked pages.

        The item is ``{'file_urls': [response.url]}``.
        NOTE(review): ``file_urls`` is presumably consumed by a files
        pipeline enabled in the project settings -- confirm there.

        Link targets that do not start with ``/`` are skipped.
        """
        yield {'file_urls': [response.url]}

        for _title, path in self.get_all_md_links(response.text):
            if not path.startswith('/'):
                continue
            # Cut the target at the first '?' or '#': appending '/download'
            # after a query string or fragment would put the suffix inside
            # that component instead of extending the URL path.
            page_path = self._PATH_END_RE.split(path, maxsplit=1)[0]
            next_page = response.urljoin(page_path) + '/download'
            yield scrapy.Request(next_page, callback=self.parse)