diff options
-rw-r--r-- | .gitignore | 1 | ||||
-rwxr-xr-x | gg_scraper.py | 25 | ||||
-rw-r--r-- | setup.py | 29 | ||||
-rw-r--r-- | test/test_functional.py | 5 |
4 files changed, 25 insertions, 35 deletions
@@ -4,3 +4,4 @@ MANIFEST dist/ build/ +gg_scraper.egg-info/ diff --git a/gg_scraper.py b/gg_scraper.py index b1c3260..89939bf 100755 --- a/gg_scraper.py +++ b/gg_scraper.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/python # -*- coding: utf-8 -*- """ Download a Google Group to MBOX @@ -54,13 +54,17 @@ MANGLED_ADDR_RE = re.compile( r'([a-zA-Z0-9_.+-]+(\.)+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)', re.IGNORECASE) -__version__ = '0.8' +__version__ = '0.9.0' pyver = sys.version_info py26 = pyver[:2] < (2, 7) py3k = pyver[0] == 3 +class BadURLError(ValueError): + pass + + class Page(object): verb_handler = HTTPHandler() if logging.getLogger().getEffectiveLevel() == logging.DEBUG: @@ -172,14 +176,18 @@ class Topic(Page): class Group(Page): + GOOD_URL_RE = re.compile(r'https://groups.google.com/forum/#!forum/(.+)') + def __init__(self, URL): super(Group, self).__init__() self.group_URL = URL self.topics = [] - match = re.match(r'https://groups.google.com/forum/#!forum/(.+)', - URL) - if match is not None: - self.name = match.group(1) + match = self.GOOD_URL_RE.match(URL) + logging.debug('match = %s', match) + if match is None: + raise BadURLError("Required URL in form 'https://groups.google.com/forum/#!forum/GROUPNAME'") + + self.name = match.group(1) @staticmethod def get_count_topics(BS): @@ -305,10 +313,7 @@ class MBOX(mailbox.mbox): self.lock() for mbx_str in group_object.all_messages(): try: - if not py26: - self.add(mbx_str.encode()) - else: - self.add(mbx_str.encode('utf8')) + self.add(mbx_str.encode('utf8')) except UnicodeDecodeError: logging.warning('mbx_str = type {0}'.format(type(mbx_str))) self.unlock() @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, unicode_literals import sys -from distutils.core import setup, Command +from setuptools import setup import io try: import unittest2 as unittest @@ -11,26 +11,6 @@ except ImportError: import gg_scraper -class RunTests(Command): - """New setup.py command to run all tests for the package. - """ - description = "run all tests for the package" - - user_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - - def run(self): - tests = unittest.TestLoader().discover('test') - runner = unittest.TextTestRunner(verbosity=2) - res = runner.run(tests) - sys.exit(int(not res.wasSuccessful())) - - classifiers = [ 'Development Status :: 3 - Alpha', 'Operating System :: OS Independent', @@ -58,7 +38,6 @@ setup(name='gg_scraper', keywords=['email', 'Google Groups', 'scrap', 'backup'], license='GNU GPL', classifiers=classifiers, - cmdclass={ - 'test': RunTests, - }, - requires=['beautifulsoup4', 'PyYAML']) + test_suite="test", + requires=['beautifulsoup4', 'PyYAML'] +) diff --git a/test/test_functional.py b/test/test_functional.py index 02ccc6d..34b49c6 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -10,6 +10,7 @@ except ImportError: import gg_scraper IN_URL = 'https://groups.google.com/forum/#!forum/jbrout' +BAD_URL = "http://groups.google.com/group/rdflib-dev" ORIG_URL = 'http://groups.google.com/d/forum/jbrout' EXP_URL = 'https://groups.google.com/forum/' + \ '?_escaped_fragment_=forum/jbrout' @@ -50,6 +51,10 @@ class TestGGScrapperFunctional(unittest.TestCase): article_count = topic.get_count_articles() self.assertEqual(len(articles), article_count) + def test_wrong_URL(self): + with self.assertRaises(gg_scraper.BadURLError): + gg_scraper.Group(BAD_URL) + def test_get_raw_article(self): self.maxDiff = None article = gg_scraper.Article(ARTICLE_URL) |