diff options
author | Matěj Cepl <mcepl@redhat.com> | 2013-12-28 00:27:09 +0100 |
---|---|---|
committer | Matěj Cepl <mcepl@redhat.com> | 2013-12-28 00:51:43 +0100 |
commit | 58ec4876d8dd638de14c2ccb3d959c40eadfe2d8 (patch) | |
tree | 02e70224d63e15ef137b967a55d8f24abe56bee1 /gg_scrapper.py | |
parent | 9dcfa6e10d512cb767dac9c410c96072f7cbd166 (diff) | |
download | gg_scraper-58ec4876d8dd638de14c2ccb3d959c40eadfe2d8.tar.gz |
Convert to python3.
Diffstat (limited to 'gg_scrapper.py')
-rwxr-xr-x | gg_scrapper.py | 63 |
1 files changed, 41 insertions, 22 deletions
diff --git a/gg_scrapper.py b/gg_scrapper.py index 650810c..37f7a13 100755 --- a/gg_scrapper.py +++ b/gg_scrapper.py @@ -1,7 +1,9 @@ -#!/usr/bin/python +#!/usr/bin/python3 import re -import urllib2 +import urllib.request +import urllib.error +import urllib.parse from bs4 import BeautifulSoup import logging logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', @@ -10,24 +12,15 @@ logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', TOPIC_COUNT_RE = re.compile(r'\D+ \d+ - \d+ \D+ (\d+) \D+$') -class Topic(object): - def __init__(self, URL, name): - self.name = name - self.root = URL # root of the discussion - - def __unicode__(self): - return "%s: %s" % (self.root, self.name) - - -class GooglePage(object): - verb_handler = urllib2.HTTPHandler() +class Page(object): + verb_handler = urllib.request.HTTPHandler() if logging.getLogger().getEffectiveLevel() == logging.DEBUG: verb_handler.set_http_debuglevel(2) - redir_handler = urllib2.HTTPRedirectHandler() - opener = urllib2.build_opener(verb_handler, redir_handler) + redir_handler = urllib.request.HTTPRedirectHandler() + opener = urllib.request.build_opener(verb_handler, redir_handler) - def __init__(self, URL): - self.bs_page = self.get_first_page_BS(URL) + def __init__(self): + pass @staticmethod def unenscape_Google_bang_URL(old_URL): @@ -51,15 +44,40 @@ class GooglePage(object): logging.debug('url = {}'.format(new_URL)) return cls.unenscape_Google_bang_URL(new_URL) else: - raise urllib2.HTTPError('Unknown URL: {}'.format(URL)) + raise urllib.error.HTTPError('Unknown URL: {}'.format(URL)) - def get_first_page_BS(self, URL): + def _get_page_BS(self, URL): res = self.opener.open(self.do_redirect(URL)) in_str = res.read() bs = BeautifulSoup(in_str) res.close() return bs + +class Article(Page): + def __init__(self): + super(Article, self).__init__() + + +class Topic(Page): + def __init__(self, URL, name): + super(Topic, self).__init__() + self.name = name + self.root = URL + + def __unicode__(self): + return "%s: %s" % (self.root, self.name) + + def get_articles(self): + page = self._get_page_BS(self.root) + page = page + + +class Group(Page): + def __init__(self, URL): + super(Group, self).__init__() + self.group_URL = URL + def get_count_topics(self, BS): '''Get total number of topics from the number on the page itself. @@ -74,13 +92,14 @@ class GooglePage(object): i_str = i_elem[0].string return int(TOPIC_COUNT_RE.match(i_str).group(1)) - def get_topics(self, BS): + def get_topics(self): '''Recursively[?] get all topic (as special objects) Also return (for error checking) number of topics from the head of the topic page. ''' out = [] other = [] + BS = self._get_page_BS(self.group_URL) for a_elem in BS.find_all('a'): if 'title' in a_elem.attrs: # filter out all-non-topic <a>s @@ -92,8 +111,8 @@ class GooglePage(object): other.append(a_elem) if len(other) == 1: - new_bs = BeautifulSoup(self.opener.open(other[0]['href']).read()) - out.extend(self.get_topics(new_bs)) + new_bs = Group(other[0]['href']) + out.extend(new_bs.get_topics()) elif len(other) != 0: raise ValueError( 'There must be either one or none link to the next page!') |