chiark / gitweb /
yarrg database: when scraping yppedia charts, allow links to islands whose pages...
author Ian Jackson <ijackson@chiark.greenend.org.uk>
Fri, 13 Jan 2012 00:56:37 +0000 (00:56 +0000)
committer Ian Jackson <ijackson@chiark.greenend.org.uk>
Fri, 13 Jan 2012 00:56:37 +0000 (00:56 +0000)
yarrg/yppedia-ocean-scraper

index ba145eafcc247b6bae90d13244b159d62570928d..5e5e0901f563b097b6554b891e04cf184f89b45a 100755 (executable)
@@ -99,7 +99,7 @@ def fetch():
        soup = BeautifulSoup(dataf)
 
 title_arch_re = regexp.compile('(\\S.*\\S) Archipelago \\((\\S+)\\)$')
-title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$')
+title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)(?: \(page does not exist\))?$')
 href_img_re = regexp.compile('\\.png$')
 
 def title_arch_info(t):