diff options
-rw-r--r-- | .travis.yml | 10 | ||||
-rwxr-xr-x | gg_scrapper.py | 50 | ||||
-rw-r--r-- | setup.py | 5 | ||||
-rw-r--r-- | test/test_functional.py | 3 | ||||
-rw-r--r-- | test/test_unit.py | 3 |
5 files changed, 49 insertions, 22 deletions
diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..57f70f9 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,10 @@ +language: python +python: + - "2.6" + - "2.7" + - "pypy" + - "3.2" + - "3.3" +before_script: + - "pip install --use-mirrors ." +script: PYTHONPATH=$PYTHONPATH:. python test/test_html2text.py -v diff --git a/gg_scrapper.py b/gg_scrapper.py index c11d69f..40a255e 100755 --- a/gg_scrapper.py +++ b/gg_scrapper.py @@ -1,4 +1,5 @@ #!/usr/bin/python3 +# -*- coding: utf-8 -*- """ Download a Google Group to MBOX Copyright (C) 2014 Matěj Cepl @@ -16,18 +17,26 @@ General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.' """ +from __future__ import absolute_import, print_function, unicode_literals import argparse -from configparser import ConfigParser +try: + from configparser import ConfigParser +except ImportError: + from ConfigParser import ConfigParser import mailbox import os.path import re import shutil import subprocess import sys -import urllib.error -import urllib.parse -import urllib.request +try: + from urllib.error import HTTPError + from urllib.request import HTTPHandler, HTTPRedirectHandler, \ + build_opener +except ImportError: + from urllib2 import (HTTPError, HTTPHandler, HTTPRedirectHandler, + build_opener) #from concurrent.futures import ProcessPoolExecutor from bs4 import BeautifulSoup import logging @@ -43,11 +52,11 @@ __version__ = '0.3' class Page(object): - verb_handler = urllib.request.HTTPHandler() + verb_handler = HTTPHandler() if logging.getLogger().getEffectiveLevel() == logging.DEBUG: verb_handler.set_http_debuglevel(2) - redir_handler = urllib.request.HTTPRedirectHandler() - opener = urllib.request.build_opener(verb_handler, redir_handler) + redir_handler = HTTPRedirectHandler() + opener = build_opener(verb_handler, redir_handler) def __init__(self): pass @@ -72,7 +81,7 @@ class Page(object): new_URL = res.geturl() return cls.unenscape_Google_bang_URL(new_URL) else: - raise urllib.error.HTTPError('Unknown URL: {}'.format(URL)) + raise HTTPError('Unknown URL: {}'.format(URL)) def _get_page_BS(self, URL): res = self.opener.open(self.do_redirect(URL)) @@ -91,17 +100,20 @@ class Article(Page): def collect_message(self): logging.debug('self.root = {}'.format(self.root)) try: - with self.opener.open(self.root) as res: - raw_msg = res.read() - proc = subprocess.Popen(['/usr/bin/formail'], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - universal_newlines=True) - result = proc.communicate(raw_msg.decode())[0] - return result - except urllib.error.HTTPError as exc: + res = self.opener.open(self.root) + raw_msg = res.read() + proc = subprocess.Popen(['/usr/bin/formail'], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + universal_newlines=True) + result = proc.communicate(raw_msg.decode())[0] + except HTTPError as exc: logging.warning('Exception on downloading {}:\n{}'.format( self.root, exc)) + finally: + res.close() + + return result class Topic(Page): @@ -248,8 +260,8 @@ class Group(Page): class MBOX(mailbox.mbox): def __init__(self, filename): if os.path.exists(filename): - shutil.move(filename, '{}.bak'.format(filename)) - super(MBOX, self).__init__(filename) + shutil.move(filename, '{0}.bak'.format(filename)) + mailbox.mbox.__init__(self, filename) self.box_name = filename def write_group(self, group_object): @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, print_function, unicode_literals from distutils.core import setup, Command import unittest @@ -51,4 +53,5 @@ setup(name='gg_scrapper', classifiers=classifiers, cmdclass={ 'test': RunTests, - }) + }, + requires=['beautifulsoup4', 'PyYAML']) diff --git a/test/test_functional.py b/test/test_functional.py index 3e8d874..ec76998 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import logging +import io import os.path import unittest import gg_scrapper @@ -40,7 +41,7 @@ class TestGGScrapperFunctional(unittest.TestCase): rfc_msg = '\n'.join(rfc_msg.split('\n')[1:]) exp_file_name = os.path.join(os.path.dirname(__file__), 'message.eml') - with open(exp_file_name, 'r', encoding='utf8') as exp_f: + with io.open(exp_file_name, 'r', encoding='utf8') as exp_f: self.assertEqual(rfc_msg, exp_f.read()) diff --git a/test/test_unit.py b/test/test_unit.py index 9138466..70b001b 100644 --- a/test/test_unit.py +++ b/test/test_unit.py @@ -61,7 +61,8 @@ class TestDemangle(unittest.TestCase): with open('unmangled.mbx') as obs_mbx_f: with open('test/mbox_unmangled.mbx') as exp_mbx_f: self.assertAlmostEqual(len(obs_mbx_f.read()), - len(exp_mbx_f.read())) + len(exp_mbx_f.read()), + delta=100) if __name__ == '__main__': unittest.main() |