about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--  .gitignore  1
-rwxr-xr-x  gg_scraper.py  25
-rw-r--r--  setup.py  29
-rw-r--r--  test/test_functional.py  5
4 files changed, 25 insertions, 35 deletions
diff --git a/.gitignore b/.gitignore
index fb36334..22729ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@
MANIFEST
dist/
build/
+gg_scraper.egg-info/
diff --git a/gg_scraper.py b/gg_scraper.py
index b1c3260..89939bf 100755
--- a/gg_scraper.py
+++ b/gg_scraper.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Download a Google Group to MBOX
@@ -54,13 +54,17 @@ MANGLED_ADDR_RE = re.compile(
r'([a-zA-Z0-9_.+-]+(\.)+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)',
re.IGNORECASE)
-__version__ = '0.8'
+__version__ = '0.9.0'
pyver = sys.version_info
py26 = pyver[:2] < (2, 7)
py3k = pyver[0] == 3
+class BadURLError(ValueError):
+ pass
+
+
class Page(object):
verb_handler = HTTPHandler()
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
@@ -172,14 +176,18 @@ class Topic(Page):
class Group(Page):
+ GOOD_URL_RE = re.compile(r'https://groups.google.com/forum/#!forum/(.+)')
+
def __init__(self, URL):
super(Group, self).__init__()
self.group_URL = URL
self.topics = []
- match = re.match(r'https://groups.google.com/forum/#!forum/(.+)',
- URL)
- if match is not None:
- self.name = match.group(1)
+ match = self.GOOD_URL_RE.match(URL)
+ logging.debug('match = %s', match)
+ if match is None:
+ raise BadURLError("Required URL in form 'https://groups.google.com/forum/#!forum/GROUPNAME'")
+
+ self.name = match.group(1)
@staticmethod
def get_count_topics(BS):
@@ -305,10 +313,7 @@ class MBOX(mailbox.mbox):
self.lock()
for mbx_str in group_object.all_messages():
try:
- if not py26:
- self.add(mbx_str.encode())
- else:
- self.add(mbx_str.encode('utf8'))
+ self.add(mbx_str.encode('utf8'))
except UnicodeDecodeError:
logging.warning('mbx_str = type {0}'.format(type(mbx_str)))
self.unlock()
diff --git a/setup.py b/setup.py
index e5379e1..af06596 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, unicode_literals
import sys
-from distutils.core import setup, Command
+from setuptools import setup
import io
try:
import unittest2 as unittest
@@ -11,26 +11,6 @@ except ImportError:
import gg_scraper
-class RunTests(Command):
- """New setup.py command to run all tests for the package.
- """
- description = "run all tests for the package"
-
- user_options = []
-
- def initialize_options(self):
- pass
-
- def finalize_options(self):
- pass
-
- def run(self):
- tests = unittest.TestLoader().discover('test')
- runner = unittest.TextTestRunner(verbosity=2)
- res = runner.run(tests)
- sys.exit(int(not res.wasSuccessful()))
-
-
classifiers = [
'Development Status :: 3 - Alpha',
'Operating System :: OS Independent',
@@ -58,7 +38,6 @@ setup(name='gg_scraper',
keywords=['email', 'Google Groups', 'scrap', 'backup'],
license='GNU GPL',
classifiers=classifiers,
- cmdclass={
- 'test': RunTests,
- },
- requires=['beautifulsoup4', 'PyYAML'])
+ test_suite="test",
+ requires=['beautifulsoup4', 'PyYAML']
+)
diff --git a/test/test_functional.py b/test/test_functional.py
index 02ccc6d..34b49c6 100644
--- a/test/test_functional.py
+++ b/test/test_functional.py
@@ -10,6 +10,7 @@ except ImportError:
import gg_scraper
IN_URL = 'https://groups.google.com/forum/#!forum/jbrout'
+BAD_URL = "http://groups.google.com/group/rdflib-dev"
ORIG_URL = 'http://groups.google.com/d/forum/jbrout'
EXP_URL = 'https://groups.google.com/forum/' + \
'?_escaped_fragment_=forum/jbrout'
@@ -50,6 +51,10 @@ class TestGGScrapperFunctional(unittest.TestCase):
article_count = topic.get_count_articles()
self.assertEqual(len(articles), article_count)
+ def test_wrong_URL(self):
+ with self.assertRaises(gg_scraper.BadURLError):
+ gg_scraper.Group(BAD_URL)
+
def test_get_raw_article(self):
self.maxDiff = None
article = gg_scraper.Article(ARTICLE_URL)