about summary refs log tree commit diff stats
path: root/gg_scraper.py
diff options
context:
space:
mode:
Diffstat (limited to 'gg_scraper.py')
-rwxr-xr-x gg_scraper.py 25
1 files changed, 15 insertions, 10 deletions
diff --git a/gg_scraper.py b/gg_scraper.py
index b1c3260..89939bf 100755
--- a/gg_scraper.py
+++ b/gg_scraper.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Download a Google Group to MBOX
@@ -54,13 +54,17 @@ MANGLED_ADDR_RE = re.compile(
r'([a-zA-Z0-9_.+-]+(\.)+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)',
re.IGNORECASE)
-__version__ = '0.8'
+__version__ = '0.9.0'
pyver = sys.version_info
py26 = pyver[:2] < (2, 7)
py3k = pyver[0] == 3
+class BadURLError(ValueError):
+ pass
+
+
class Page(object):
verb_handler = HTTPHandler()
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
@@ -172,14 +176,18 @@ class Topic(Page):
class Group(Page):
+ GOOD_URL_RE = re.compile(r'https://groups.google.com/forum/#!forum/(.+)')
+
def __init__(self, URL):
super(Group, self).__init__()
self.group_URL = URL
self.topics = []
- match = re.match(r'https://groups.google.com/forum/#!forum/(.+)',
- URL)
- if match is not None:
- self.name = match.group(1)
+ match = self.GOOD_URL_RE.match(URL)
+ logging.debug('match = %s', match)
+ if match is None:
+ raise BadURLError("Required URL in form 'https://groups.google.com/forum/#!forum/GROUPNAME'")
+
+ self.name = match.group(1)
@staticmethod
def get_count_topics(BS):
@@ -305,10 +313,7 @@ class MBOX(mailbox.mbox):
self.lock()
for mbx_str in group_object.all_messages():
try:
- if not py26:
- self.add(mbx_str.encode())
- else:
- self.add(mbx_str.encode('utf8'))
+ self.add(mbx_str.encode('utf8'))
except UnicodeDecodeError:
logging.warning('mbx_str = type {0}'.format(type(mbx_str)))
self.unlock()