diff options
-rwxr-xr-x | gg_scrapper.py | 61 | ||||
-rw-r--r-- | test/__init__.py | 0 | ||||
-rw-r--r-- | test/group.html (renamed from dump.html) | 0 | ||||
-rw-r--r-- | test/test_functional.py | 33 | ||||
-rw-r--r-- | test/test_unit.py (renamed from test_gg_scrapper.py) | 9 |
5 files changed, 83 insertions, 20 deletions
diff --git a/gg_scrapper.py b/gg_scrapper.py index 37f7a13..f516892 100755 --- a/gg_scrapper.py +++ b/gg_scrapper.py @@ -4,12 +4,15 @@ import re import urllib.request import urllib.error import urllib.parse +#from concurrent.futures import ProcessPoolExecutor from bs4 import BeautifulSoup import logging logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', level=logging.DEBUG) TOPIC_COUNT_RE = re.compile(r'\D+ \d+ - \d+ \D+ (\d+) \D+$') +ARTICL_MSG_URL_RE = re.compile(r'https://groups.google.com/d/msg/') +ARTICLE_COUNT_RE = re.compile(r'\D+ \d+\D+\d+ \D+ (\d+) \D+$') class Page(object): @@ -55,22 +58,49 @@ class Page(object): class Article(Page): - def __init__(self): + def __init__(self, URL): super(Article, self).__init__() + self.root = URL class Topic(Page): def __init__(self, URL, name): super(Topic, self).__init__() self.name = name - self.root = URL + self.root = self.do_redirect(URL) def __unicode__(self): return "%s: %s" % (self.root, self.name) + @staticmethod + def get_one_article(elem): + return elem + + def get_count_articles(self): + '''Get total number of articles from the number on the page + itself. + ''' + BS = self._get_page_BS(self.root) + i_elem = BS.find_all('i') + if len(i_elem) <= 0: + raise ValueError('Cannot find count of topics!') + + i_str = i_elem[0].string + logging.debug('i_str = {}'.format(i_str)) + logging.debug('RE = {}'.format(ARTICLE_COUNT_RE.pattern)) + return int(ARTICLE_COUNT_RE.match(i_str).group(1)) + def get_articles(self): + out = [] page = self._get_page_BS(self.root) - page = page + for a_elem in page.find_all('a'): + if 'href' in a_elem.attrs: + a_href = a_elem['href'] + if ARTICL_MSG_URL_RE.match(a_href) is not None: + logging.debug('a_elem = %s', a_href) + out.append(Article(a_href)) + + return out class Group(Page): @@ -78,7 +108,8 @@ class Group(Page): super(Group, self).__init__() self.group_URL = URL - def get_count_topics(self, BS): + @staticmethod + def get_count_topics(BS): '''Get total number of topics from the number on the page itself. @@ -92,6 +123,17 @@ class Group(Page): i_str = i_elem[0].string return int(TOPIC_COUNT_RE.match(i_str).group(1)) + @staticmethod + def get_one_topic(elem): + if 'title' in elem.attrs: + # filter out all-non-topic <a>s + logging.debug('href = %s', elem['href']) + logging.debug('title = %s', elem['title']) + return True, Topic(elem['href'], elem['title']) + else: + logging.debug('other = %s', elem) + return False, elem + def get_topics(self): '''Recursively[?] get all topic (as special objects) Also return (for error checking) number of topics from the head @@ -101,14 +143,11 @@ class Group(Page): other = [] BS = self._get_page_BS(self.group_URL) for a_elem in BS.find_all('a'): - if 'title' in a_elem.attrs: - # filter out all-non-topic <a>s - logging.debug('href = %s', a_elem['href']) - logging.debug('title = %s', a_elem['title']) - out.append(Topic(a_elem['href'], a_elem['title'])) + is_topic, res = self.get_one_topic(a_elem) + if is_topic: + out.append(res) else: - logging.debug('other = %s', a_elem) - other.append(a_elem) + other.append(res) if len(other) == 1: new_bs = Group(other[0]['href']) diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/test/__init__.py diff --git a/dump.html b/test/group.html index 8753299..8753299 100644 --- a/dump.html +++ b/test/group.html diff --git a/test/test_functional.py b/test/test_functional.py new file mode 100644 index 0000000..cb5f2e0 --- /dev/null +++ b/test/test_functional.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- + +import logging +import unittest +import gg_scrapper + +IN_URL = 'https://groups.google.com/forum/#!forum/jbrout' +ORIG_URL = 'http://groups.google.com/d/forum/jbrout' +EXP_URL = 'https://groups.google.com/forum/' + \ + '?_escaped_fragment_=forum/jbrout' +TOPIC_URL = 'https://groups.google.com/forum/#!topic/jbrout/xNwoVmC07KI' + + +class TestGGScrapperFunctional(unittest.TestCase): + def test_collecting_topics(self): + page = gg_scrapper.Group(IN_URL) + topics = page.get_topics() + logging.debug("number of topics = %d", len(topics)) + self.assertGreater(len(topics), 0) + + def test_collecting_articles(self): + logging.debug('topic = URL {}'.format(TOPIC_URL)) + topic = gg_scrapper.Topic(TOPIC_URL, + 'repo version incompatible with ' + + 'ubuntu 11.04 ?') + articles = topic.get_articles() + article_count = topic.get_count_articles() + logging.debug('article_count = {0:d}'.format(article_count)) + logging.debug('articles = len {0:d}'.format(len(articles))) + self.assertEqual(len(articles), article_count) + +if __name__ == '__main__': + unittest.main() diff --git a/test_gg_scrapper.py b/test/test_unit.py index 10e7f4a..4674529 100644 --- a/test_gg_scrapper.py +++ b/test/test_unit.py @@ -1,6 +1,3 @@ -# -*- coding: utf-8 -*- - -import logging import unittest import gg_scrapper @@ -19,11 +16,5 @@ class TestGGScrapper(unittest.TestCase): obs_URL = gg_scrapper.Group.do_redirect(ORIG_URL) self.assertEqual(obs_URL, EXP_URL) - def test_collecting_topics(self): - page = gg_scrapper.Group(IN_URL) - topics = page.get_topics() - logging.debug("number of topics = %d", len(topics)) - self.assertGreater(len(topics), 0) - if __name__ == '__main__': unittest.main() |