about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- .travis.yml 3
-rw-r--r-- README.rst 4
-rwxr-xr-x gg_scraper.py 76
-rw-r--r-- setup.py 5
-rw-r--r-- test/test_functional.py 7
-rw-r--r-- test/test_unit.py 9
6 files changed, 74 insertions, 30 deletions
diff --git a/.travis.yml b/.travis.yml
index 85d9c5c..58494ce 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,9 +1,12 @@
language: python
python:
+ - "2.6"
- "2.7"
- "pypy"
- "3.2"
- "3.3"
before_script:
+ - '[ "${TRAVIS_PYTHON_VERSION}" = "2.6" ] && pip install --use-mirrors futures ordereddict unittest2 || /bin/true'
+ - "sudo apt-get install procmail"
- "pip install --use-mirrors beautifulsoup4 PyYAML"
script: python setup.py test
diff --git a/README.rst b/README.rst
index 0d7e98a..7727864 100644
--- a/README.rst
+++ b/README.rst
@@ -16,7 +16,9 @@ email (one of many of my addresses are available on my `Github page`_ )
.. _`Github page`:
https://github.com/mcepl
-Of course pull requests are more than welcome in the same places as well. Currently all development is done with Python 3.3, but tests are run on Travis-CI for 2.7 and pypy as well.
+Of course pull requests are more than welcome in the same places as
+well. Currently all development is done with Python 3.3, but tests are
+run on Travis-CI for 2.6, 2.7, and pypy as well.
.. image:: https://secure.travis-ci.org/mcepl/gg_scraper.png
:alt: Build Status
diff --git a/gg_scraper.py b/gg_scraper.py
index 556fbb9..5656fa4 100755
--- a/gg_scraper.py
+++ b/gg_scraper.py
@@ -20,7 +20,10 @@ with this program. If not, see <http://www.gnu.org/licenses/>.'
from __future__ import absolute_import, print_function, unicode_literals
import argparse
-from collections import OrderedDict
+try:
+ from collections import OrderedDict
+except ImportError:
+ from ordereddict import OrderedDict
import operator
try:
from configparser import ConfigParser
@@ -32,6 +35,7 @@ import re
import shutil
import subprocess
import sys
+import yaml
try:
from urllib.error import HTTPError
from urllib.request import HTTPHandler, HTTPRedirectHandler, \
@@ -43,7 +47,7 @@ except ImportError:
from bs4 import BeautifulSoup
import logging
logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
- level=logging.INFO)
+ level=logging.DEBUG)
ADDR_SEC_LABEL = 'addresses'
MANGLED_ADDR_RE = re.compile(
@@ -52,6 +56,11 @@ MANGLED_ADDR_RE = re.compile(
__version__ = '0.5'
+if sys.version_info[:2] < (2, 7):
+ py26 = True
+else:
+ py26 = False
+
class Page(object):
verb_handler = HTTPHandler()
@@ -83,7 +92,7 @@ class Page(object):
new_URL = res.geturl()
return cls.unenscape_Google_bang_URL(new_URL)
else:
- raise HTTPError('Unknown URL: {}'.format(URL))
+ raise HTTPError('Unknown URL: {0}'.format(URL))
def _get_page_BS(self, URL):
res = self.opener.open(self.do_redirect(URL))
@@ -100,20 +109,23 @@ class Article(Page):
self.raw_message = ''
def collect_message(self):
- logging.debug('self.root = {}'.format(self.root))
+ logging.debug('self.root = {0}'.format(self.root))
+ result = None
try:
res = self.opener.open(self.root)
- raw_msg = res.read()
+ if not py26:
+ raw_msg = res.read().decode('utf8')
+ else:
+ raw_msg = res.read()
proc = subprocess.Popen(['/usr/bin/formail'],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
universal_newlines=True)
- result = proc.communicate(raw_msg.decode())[0]
+ result = proc.communicate(raw_msg)[0]
+ res.close()
except HTTPError as exc:
- logging.warning('Exception on downloading {}:\n{}'.format(
+ logging.warning('Exception on downloading {0}:\n{1}'.format(
self.root, exc))
- finally:
- res.close()
return result
@@ -225,8 +237,9 @@ class Group(Page):
self.topics = self.get_topics()
len_topics = len(self.topics)
for top in self.topics:
- print('[%d/%d] downloading "%s"' % (self.topics.index(top),
- len_topics, top.name))
+ #print('[%d/%d] downloading "%s"' % (self.topics.index(top),
+ # len_topics, top.name))
+ print('[%d/%d] downloading' % (self.topics.index(top), len_topics))
arts = top.get_articles()
top.articles = arts
for a in arts:
@@ -258,8 +271,8 @@ class Group(Page):
key=operator.itemgetter(1),
reverse=True))
- with open('{}.cnf'.format(self.name), 'w') as cnf_f:
- cnf_p = ConfigParser()
+ with open('{0}.cnf'.format(self.name), 'w') as cnf_f:
+ cnf_p = ConfigParser(dict_type=OrderedDict)
cnf_p.add_section(ADDR_SEC_LABEL)
for addr in addrs:
cnf_p.set(ADDR_SEC_LABEL, addr, '')
@@ -273,26 +286,41 @@ class MBOX(mailbox.mbox):
mailbox.mbox.__init__(self, filename)
self.box_name = filename
+ lockfile = '{0}.lock'.format(filename)
+ if os.path.exists(lockfile):
+ os.unlink(lockfile)
+
def write_group(self, group_object):
self.lock()
for mbx_str in group_object.all_messages():
- self.add(mbx_str.encode())
+ try:
+ if not py26:
+ self.add(mbx_str.encode())
+ else:
+ self.add(mbx_str.encode('utf8'))
+ except UnicodeDecodeError:
+ logging.debug('mbx_str = type {0}'.format(type(mbx_str)))
self.unlock()
self.close()
def main(group_URL):
# Collect all messages to the internal variables
- grp = Group(group_URL)
- grp.collect_group()
+ if os.path.exists('group.yaml'):
+ with open('group.yaml') as yf:
+ logging.debug('Loading state from group.yaml')
+ grp = yaml.load(yf)
+ logging.debug('Done')
+ else:
+ grp = Group(group_URL)
+ grp.collect_group()
- #import yaml
- # dump the state for debugging
- #with open('group.yaml', 'w') as yf:
- # yaml.dump(grp, yf)
+ # dump the state for debugging
+ with open('group.yaml', 'w') as yf:
+ yaml.dump(grp, yf)
# Write MBOX
- mbx = MBOX("{}.mbx".format(grp.name))
+ mbx = MBOX("{0}.mbx".format(grp.name))
mbx.write_group(grp)
# generate list of addresses protected against spammers
@@ -300,12 +328,12 @@ def main(group_URL):
def demangle(correct_list, orig_mbx, out_mbx):
- cnf_p = ConfigParser()
+ cnf_p = ConfigParser(dict_type=OrderedDict)
cnf_p.read(correct_list)
pairs = dict(cnf_p.items(ADDR_SEC_LABEL))
if os.path.exists(out_mbx):
- shutil.move(out_mbx, '{}.bak'.format(out_mbx))
+ shutil.move(out_mbx, '{0}.bak'.format(out_mbx))
in_mbx = mailbox.mbox(orig_mbx)
out_mbx = mailbox.mbox(out_mbx)
@@ -341,7 +369,7 @@ if __name__ == '__main__':
'file.')
args = parser.parse_args()
- logging.debug('args = {}'.format(args))
+ logging.debug('args = {0}'.format(args))
if args.demangle is not None:
demangle(args.demangle[0], args.demangle[1], args.demangle[2])
diff --git a/setup.py b/setup.py
index ccaec04..331e57e 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,10 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, unicode_literals
from distutils.core import setup, Command
-import unittest
+try:
+ import unittest2 as unittest
+except ImportError:
+ import unittest
import gg_scraper
diff --git a/test/test_functional.py b/test/test_functional.py
index c8f5bf2..2c3ad27 100644
--- a/test/test_functional.py
+++ b/test/test_functional.py
@@ -3,7 +3,10 @@
import logging
import io
import os.path
-import unittest
+try:
+ import unittest2 as unittest
+except ImportError:
+ import unittest
import gg_scraper
IN_URL = 'https://groups.google.com/forum/#!forum/jbrout'
@@ -23,7 +26,7 @@ class TestGGScrapperFunctional(unittest.TestCase):
self.assertGreater(len(topics), 0)
def test_collecting_articles(self):
- logging.debug('topic = URL {}'.format(TOPIC_URL))
+ logging.debug('topic = URL {0}'.format(TOPIC_URL))
topic = gg_scraper.Topic(TOPIC_URL,
'repo version incompatible with ' +
'ubuntu 11.04 ?')
diff --git a/test/test_unit.py b/test/test_unit.py
index 503aafe..b286f97 100644
--- a/test/test_unit.py
+++ b/test/test_unit.py
@@ -1,7 +1,11 @@
import os
import tempfile
import yaml
-import unittest
+import sys
+try:
+ import unittest2 as unittest
+except ImportError:
+ import unittest
import gg_scraper
from gg_scraper import Group, Topic, Article # noqa
@@ -27,6 +31,7 @@ class TestMBOX(unittest.TestCase):
with open(group_file_name, 'r') as group_f:
self.group = yaml.load(group_f)
+ @unittest.skipIf(sys.version_info[:2] < (2, 7), 'Formatting on 2.6 is different')
def test_create_mbox(self):
'''Create a mbox file from (YAMLed) Group
'''
@@ -47,7 +52,7 @@ class TestMBOX(unittest.TestCase):
self.group.collect_mangled_addrs()
- with open('{}.cnf'.format(self.group.name)) as obs_f:
+ with open('{0}.cnf'.format(self.group.name)) as obs_f:
mang_addres = obs_f.read()
self.assertEqual(exp_str, mang_addres)