You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
21 lines
628 B
21 lines
628 B
import re
|
|
|
|
import scrapy
|
|
|
|
|
|
class PagesSpider(scrapy.Spider):
    """Spider that walks site-relative Markdown links starting from the main page.

    Every response handled by :meth:`parse` is emitted as an item whose
    ``file_urls`` list holds the response URL, and every site-relative
    Markdown link found in the response body is followed through its
    ``/download`` URL.
    """

    name = "pages"
    allowed_domains = ["basement.woodbine.nyc"]
    start_urls = ["https://basement.woodbine.nyc/s/Main_Page"]

    # Inline Markdown link of the form ``[title](target)``; compiled once
    # so the pattern is not looked up again for every parsed page.
    _MD_LINK_RE = re.compile(r'\[(.*?)\]\((.*?)\)')

    def get_all_md_links(self, page_text):
        """Return all inline Markdown links found in *page_text*.

        Args:
            page_text: Text to scan for ``[title](target)`` links.

        Returns:
            A list of ``(title, target)`` string tuples, in order of
            appearance. Empty when the text contains no links.
        """
        return self._MD_LINK_RE.findall(page_text)

    def parse(self, response):
        """Emit the current URL as an item and follow site-relative links.

        Args:
            response: The response being processed; its ``text`` is
                scanned for Markdown links.

        Yields:
            First a dict ``{'file_urls': [response.url]}`` (presumably
            consumed by a files pipeline -- confirm in project settings),
            then one ``scrapy.Request`` per link whose target starts with
            ``/``, pointing at that target with ``/download`` appended and
            calling back into this method.
        """
        yield {'file_urls': [response.url]}

        for _title, target in self.get_all_md_links(response.text):
            # Only site-relative targets are followed; external URLs,
            # bare anchors and relative paths are skipped.
            if not target.startswith('/'):
                continue

            # Drop any '#fragment' and '?query' before appending the
            # suffix. Otherwise '/download' ends up inside the fragment or
            # query string and the request hits the plain page again
            # instead of its download endpoint.
            page_path = target.partition('#')[0].partition('?')[0]
            next_page = response.urljoin(page_path) + '/download'
            yield scrapy.Request(next_page, callback=self.parse)
|