Collecting topics.

Also added some testing pages.
author: Matěj Cepl <mcepl@redhat.com> 2013-12-27 03:42:30 +0100
committer: Matěj Cepl <mcepl@redhat.com> 2013-12-28 00:04:17 +0100
commit: 9dcfa6e10d512cb767dac9c410c96072f7cbd166 (patch)
tree: 38b59a4d256a23f7aef31122b45283b420cb21cd /gg_scrapper.py
parent: b33d37e962697141043e3c6d71417a525808d405 (diff)
download: gg_scraper-9dcfa6e10d512cb767dac9c410c96072f7cbd166.tar.gz
1 files changed, 56 insertions, 5 deletions
diff --git a/gg_scrapper.py b/gg_scrapper.py
index 59b81bc..650810c 100755
--- a/gg_scrapper.py
+++ b/gg_scrapper.py
@@ -1,13 +1,25 @@
 #!/usr/bin/python
 
+import re
 import urllib2
 from bs4 import BeautifulSoup
 import logging
 logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
                     level=logging.DEBUG)
 
+TOPIC_COUNT_RE = re.compile(r'\D+ \d+ - \d+ \D+ (\d+) \D+$')
 
-class GooglePage:
+
+class Topic(object):
+    def __init__(self, URL, name):
+        self.name = name
+        self.root = URL  # root of the discussion
+
+    def __unicode__(self):
+        return "%s: %s" % (self.root, self.name)
+
+
+class GooglePage(object):
     verb_handler = urllib2.HTTPHandler()
     if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
         verb_handler.set_http_debuglevel(2)
@@ -42,9 +54,48 @@ class GooglePage:
             raise urllib2.HTTPError('Unknown URL: {}'.format(URL))
 
     def get_first_page_BS(self, URL):
-        with self.opener.open(self.do_redirect(URL)) as esc_res:
-            return BeautifulSoup(esc_res.read())
+        res = self.opener.open(self.do_redirect(URL))
+        in_str = res.read()
+        bs = BeautifulSoup(in_str)
+        res.close()
+        return bs
+
+    def get_count_topics(self, BS):
+        '''Get total number of topics from the number on the page
+        itself.
+
+        Which would be awesome for control, except it is wrong on all
+        pages in various and different ways. :(
+        '''
+        i_elem = BS.find_all('i')
+        if len(i_elem) <= 0:
+            raise ValueError('Cannot find count of topics!')
+
+        i_str = i_elem[0].string
+        return int(TOPIC_COUNT_RE.match(i_str).group(1))
 
     def get_topics(self, BS):
-        'Recursively[?] get all topic (as special objects)'
-        return []
+        '''Recursively[?] get all topic (as special objects)
+        Also return (for error checking) number of topics from the head
+        of the topic page.
+        '''
+        out = []
+        other = []
+        for a_elem in BS.find_all('a'):
+            if 'title' in a_elem.attrs:
+                # filter out all-non-topic <a>s
+                logging.debug('href = %s', a_elem['href'])
+                logging.debug('title = %s', a_elem['title'])
+                out.append(Topic(a_elem['href'], a_elem['title']))
+            else:
+                logging.debug('other = %s', a_elem)
+                other.append(a_elem)
+
+        if len(other) == 1:
+            new_bs = BeautifulSoup(self.opener.open(other[0]['href']).read())
+            out.extend(self.get_topics(new_bs))
+        elif len(other) != 0:
+            raise ValueError(
+                'There must be either one or none link to the next page!')
+
+        return out
author	Matěj Cepl <mcepl@redhat.com>	2013-12-27 03:42:30 +0100
committer	Matěj Cepl <mcepl@redhat.com>	2013-12-28 00:04:17 +0100
commit	9dcfa6e10d512cb767dac9c410c96072f7cbd166 (patch)
tree	38b59a4d256a23f7aef31122b45283b420cb21cd /gg_scrapper.py
parent	b33d37e962697141043e3c6d71417a525808d405 (diff)
download	gg_scraper-9dcfa6e10d512cb767dac9c410c96072f7cbd166.tar.gz