Collect articles for one topic.

author: Matěj Cepl <mcepl@redhat.com> 2013-12-29 23:44:41 +0100
committer: Matěj Cepl <mcepl@redhat.com> 2013-12-29 23:44:41 +0100
commit: dfbb929e6b9985810646a19850268c382820791e (patch)
tree: 58568875e3ae71a0ba25dbe2a94945ad679cb957
parent: 58ec4876d8dd638de14c2ccb3d959c40eadfe2d8 (diff)
download: gg_scraper-dfbb929e6b9985810646a19850268c382820791e.tar.gz
5 files changed, 83 insertions, 20 deletions
diff --git a/gg_scrapper.py b/gg_scrapper.py
index 37f7a13..f516892 100755
--- a/gg_scrapper.py
+++ b/gg_scrapper.py
@@ -4,12 +4,15 @@ import re
 import urllib.request
 import urllib.error
 import urllib.parse
+#from concurrent.futures import ProcessPoolExecutor
 from bs4 import BeautifulSoup
 import logging
 logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
                     level=logging.DEBUG)
 
 TOPIC_COUNT_RE = re.compile(r'\D+ \d+ - \d+ \D+ (\d+) \D+$')
+ARTICL_MSG_URL_RE = re.compile(r'https://groups.google.com/d/msg/')
+ARTICLE_COUNT_RE = re.compile(r'\D+ \d+\D+\d+ \D+ (\d+) \D+$')
 
 
 class Page(object):
@@ -55,22 +58,49 @@ class Page(object):
 
 
 class Article(Page):
-    def __init__(self):
+    def __init__(self, URL):
         super(Article, self).__init__()
+        self.root = URL
 
 
 class Topic(Page):
     def __init__(self, URL, name):
         super(Topic, self).__init__()
         self.name = name
-        self.root = URL
+        self.root = self.do_redirect(URL)
 
     def __unicode__(self):
         return "%s: %s" % (self.root, self.name)
 
+    @staticmethod
+    def get_one_article(elem):
+        return elem
+
+    def get_count_articles(self):
+        '''Get total number of articles from the number on the page
+        itself.
+        '''
+        BS = self._get_page_BS(self.root)
+        i_elem = BS.find_all('i')
+        if len(i_elem) <= 0:
+            raise ValueError('Cannot find count of topics!')
+
+        i_str = i_elem[0].string
+        logging.debug('i_str = {}'.format(i_str))
+        logging.debug('RE = {}'.format(ARTICLE_COUNT_RE.pattern))
+        return int(ARTICLE_COUNT_RE.match(i_str).group(1))
+
     def get_articles(self):
+        out = []
         page = self._get_page_BS(self.root)
-        page = page
+        for a_elem in page.find_all('a'):
+            if 'href' in a_elem.attrs:
+                a_href = a_elem['href']
+                if ARTICL_MSG_URL_RE.match(a_href) is not None:
+                    logging.debug('a_elem = %s', a_href)
+                    out.append(Article(a_href))
+
+        return out
 
 
 class Group(Page):
@@ -78,7 +108,8 @@ class Group(Page):
         super(Group, self).__init__()
         self.group_URL = URL
 
-    def get_count_topics(self, BS):
+    @staticmethod
+    def get_count_topics(BS):
         '''Get total number of topics from the number on the page
         itself.
 
@@ -92,6 +123,17 @@ class Group(Page):
         i_str = i_elem[0].string
         return int(TOPIC_COUNT_RE.match(i_str).group(1))
 
+    @staticmethod
+    def get_one_topic(elem):
+        if 'title' in elem.attrs:
+            # filter out all-non-topic <a>s
+            logging.debug('href = %s', elem['href'])
+            logging.debug('title = %s', elem['title'])
+            return True, Topic(elem['href'], elem['title'])
+        else:
+            logging.debug('other = %s', elem)
+            return False, elem
+
     def get_topics(self):
         '''Recursively[?] get all topic (as special objects)
         Also return (for error checking) number of topics from the head
@@ -101,14 +143,11 @@ class Group(Page):
         other = []
         BS = self._get_page_BS(self.group_URL)
         for a_elem in BS.find_all('a'):
-            if 'title' in a_elem.attrs:
-                # filter out all-non-topic <a>s
-                logging.debug('href = %s', a_elem['href'])
-                logging.debug('title = %s', a_elem['title'])
-                out.append(Topic(a_elem['href'], a_elem['title']))
+            is_topic, res = self.get_one_topic(a_elem)
+            if is_topic:
+                out.append(res)
             else:
-                logging.debug('other = %s', a_elem)
-                other.append(a_elem)
+                other.append(res)
 
         if len(other) == 1:
             new_bs = Group(other[0]['href'])
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/__init__.py
diff --git a/dump.html b/test/group.html
index 8753299..8753299 100644
--- a/dump.html
+++ b/test/group.html
diff --git a/test/test_functional.py b/test/test_functional.py
new file mode 100644
index 0000000..cb5f2e0
--- /dev/null
+++ b/test/test_functional.py
@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+
+import logging
+import unittest
+import gg_scrapper
+
+IN_URL = 'https://groups.google.com/forum/#!forum/jbrout'
+ORIG_URL = 'http://groups.google.com/d/forum/jbrout'
+EXP_URL = 'https://groups.google.com/forum/' + \
+    '?_escaped_fragment_=forum/jbrout'
+TOPIC_URL = 'https://groups.google.com/forum/#!topic/jbrout/xNwoVmC07KI'
+
+
+class TestGGScrapperFunctional(unittest.TestCase):
+    def test_collecting_topics(self):
+        page = gg_scrapper.Group(IN_URL)
+        topics = page.get_topics()
+        logging.debug("number of topics = %d", len(topics))
+        self.assertGreater(len(topics), 0)
+
+    def test_collecting_articles(self):
+        logging.debug('topic = URL {}'.format(TOPIC_URL))
+        topic = gg_scrapper.Topic(TOPIC_URL,
+                                  'repo version incompatible with ' +
+                                  'ubuntu 11.04 ?')
+        articles = topic.get_articles()
+        article_count = topic.get_count_articles()
+        logging.debug('article_count = {0:d}'.format(article_count))
+        logging.debug('articles = len {0:d}'.format(len(articles)))
+        self.assertEqual(len(articles), article_count)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test_gg_scrapper.py b/test/test_unit.py
index 10e7f4a..4674529 100644
--- a/test_gg_scrapper.py
+++ b/test/test_unit.py
@@ -1,6 +1,3 @@
-# -*- coding: utf-8 -*-
-
-import logging
 import unittest
 import gg_scrapper
 
@@ -19,11 +16,5 @@ class TestGGScrapper(unittest.TestCase):
         obs_URL = gg_scrapper.Group.do_redirect(ORIG_URL)
         self.assertEqual(obs_URL, EXP_URL)
 
-    def test_collecting_topics(self):
-        page = gg_scrapper.Group(IN_URL)
-        topics = page.get_topics()
-        logging.debug("number of topics = %d", len(topics))
-        self.assertGreater(len(topics), 0)
-
 if __name__ == '__main__':
     unittest.main()
author	Matěj Cepl <mcepl@redhat.com>	2013-12-29 23:44:41 +0100
committer	Matěj Cepl <mcepl@redhat.com>	2013-12-29 23:44:41 +0100
commit	dfbb929e6b9985810646a19850268c382820791e (patch)
tree	58568875e3ae71a0ba25dbe2a94945ad679cb957
parent	58ec4876d8dd638de14c2ccb3d959c40eadfe2d8 (diff)
download	gg_scraper-dfbb929e6b9985810646a19850268c382820791e.tar.gz