diff options
Diffstat (limited to 'gg_scrapper.py')
-rwxr-xr-x | gg_scrapper.py | 61 |
1 files changed, 56 insertions, 5 deletions
diff --git a/gg_scrapper.py b/gg_scrapper.py
index 59b81bc..650810c 100755
--- a/gg_scrapper.py
+++ b/gg_scrapper.py
@@ -1,13 +1,25 @@
 #!/usr/bin/python
 
+import re
 import urllib2
 from bs4 import BeautifulSoup
 import logging
 logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
                     level=logging.DEBUG)
 
+TOPIC_COUNT_RE = re.compile(r'\D+ \d+ - \d+ \D+ (\d+) \D+$')
 
-class GooglePage:
+
+class Topic(object):
+    def __init__(self, URL, name):
+        self.name = name
+        self.root = URL  # root of the discussion
+
+    def __unicode__(self):
+        return "%s: %s" % (self.root, self.name)
+
+
+class GooglePage(object):
     verb_handler = urllib2.HTTPHandler()
     if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
         verb_handler.set_http_debuglevel(2)
@@ -42,9 +54,48 @@ class GooglePage:
             raise urllib2.HTTPError('Unknown URL: {}'.format(URL))
 
     def get_first_page_BS(self, URL):
-        with self.opener.open(self.do_redirect(URL)) as esc_res:
-            return BeautifulSoup(esc_res.read())
+        res = self.opener.open(self.do_redirect(URL))
+        in_str = res.read()
+        bs = BeautifulSoup(in_str)
+        res.close()
+        return bs
+
+    def get_count_topics(self, BS):
+        '''Get total number of topics from the number on the page
+        itself.
+
+        Which would be awesome for control, except it is wrong on all
+        pages in various and different ways. :(
+        '''
+        i_elem = BS.find_all('i')
+        if len(i_elem) <= 0:
+            raise ValueError('Cannot find count of topics!')
+
+        i_str = i_elem[0].string
+        return int(TOPIC_COUNT_RE.match(i_str).group(1))
 
     def get_topics(self, BS):
-        'Recursively[?] get all topic (as special objects)'
-        return []
+        '''Recursively[?] get all topic (as special objects)
+        Also return (for error checking) number of topics from the head
+        of the topic page.
+        '''
+        out = []
+        other = []
+        for a_elem in BS.find_all('a'):
+            if 'title' in a_elem.attrs:
+                # filter out all-non-topic <a>s
+                logging.debug('href = %s', a_elem['href'])
+                logging.debug('title = %s', a_elem['title'])
+                out.append(Topic(a_elem['href'], a_elem['title']))
+            else:
+                logging.debug('other = %s', a_elem)
+                other.append(a_elem)
+
+        if len(other) == 1:
+            new_bs = BeautifulSoup(self.opener.open(other[0]['href']).read())
+            out.extend(self.get_topics(new_bs))
+        elif len(other) != 0:
+            raise ValueError(
+                'There must be either one or none link to the next page!')
+
+        return out