|
|
|
@ -1,3 +1,5 @@
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
import scrapy
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -6,5 +8,13 @@ class PagesSpider(scrapy.Spider):
|
|
|
|
|
allowed_domains = ["basement.woodbine.nyc"]
|
|
|
|
|
start_urls = ["https://basement.woodbine.nyc/s/Main_Page"]
|
|
|
|
|
|
|
|
|
|
def get_all_md_links(self, page_text):
    """Collect every Markdown inline link found in *page_text*.

    Returns a list of ``(label, target)`` string tuples, one for each
    ``[label](target)`` occurrence, in order of appearance. Both parts
    are matched non-greedily, so either may be an empty string.
    """
    md_link = re.compile(r'\[(.*?)\]\((.*?)\)')
    return [(hit.group(1), hit.group(2)) for hit in md_link.finditer(page_text)]
|
|
|
|
|
|
|
|
|
|
def parse(self, response):
    """Record the fetched page and follow its site-relative Markdown links.

    Yields first a ``{'file_urls': [response.url]}`` item for the page
    itself, then one ``scrapy.Request`` for each Markdown link whose
    target starts with ``/``. The requested URL is the link target
    resolved against the response URL with ``/download`` appended, and
    each request is handled by this same callback.
    """
    yield {'file_urls': [response.url]}
    for _title, path in self.get_all_md_links(response.text):
        # Only targets beginning with "/" are followed; absolute URLs,
        # anchors and other relative forms are skipped.
        if not path.startswith('/'):
            continue
        next_page = response.urljoin(path) + '/download'
        yield scrapy.Request(next_page, callback=self.parse)
|
|
|
|
|