chiark / gitweb /
Cope with spaces in arch names as found on Hunter
author: Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Mon, 31 Aug 2009 15:36:42 +0000 (16:36 +0100)
committer: Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Mon, 31 Aug 2009 15:36:42 +0000 (16:36 +0100)
yarrg/yppedia-ocean-scraper

index 401de9b..9105f19 100755 (executable)
@@ -61,7 +61,7 @@ def fetch():
        soup = BeautifulSoup(dataf)
 
 
-title_arch_re = regexp.compile('(\\S+) Archipelago \\((\\S+)\\)$')
+title_arch_re = regexp.compile('(\\S.*\\S) Archipelago \\((\\S+)\\)$')
 title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$')
 href_img_re = regexp.compile('\\.png$')
 
@@ -107,8 +107,8 @@ def parse():
                debug('links',links)
                if not links: continue
                (a,o) = title_arch_info(links[0]['title'])
+               debug('arch-ocean', (a,o))
                assert(o == ocean)
-               debug('arch', a)
                assert(a not in arches)
                isles = []
                for link in links[1:]: