diff options
author | Matěj Cepl <mcepl@redhat.com> | 2014-01-11 11:06:05 +0100 |
---|---|---|
committer | Matěj Cepl <mcepl@redhat.com> | 2014-01-11 11:31:46 +0100 |
commit | 2c734aeadf108363e570f59b4a69adbe14835171 (patch) | |
tree | f59903beca49ddd4515b8c00fe904eda73033643 | |
parent | 9a43d17636e6f77eb93d5c4d3653f36b3348851a (diff) | |
download | gg_scraper-0.6.tar.gz |
Eliminate do_redirect() method. (tag: 0.6)
One less HTTP connection, which was actually not needed.
-rwxr-xr-x | gg_scraper.py | 29 | ||||
-rw-r--r-- | test/test_unit.py | 4 |
2 files changed, 14 insertions, 19 deletions
```diff
diff --git a/gg_scraper.py b/gg_scraper.py
index 5656fa4..563229f 100755
--- a/gg_scraper.py
+++ b/gg_scraper.py
@@ -54,7 +54,7 @@ MANGLED_ADDR_RE = re.compile(
     r'([a-zA-Z0-9_.+-]+\.\.\.@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)',
     re.IGNORECASE)
 
-__version__ = '0.5'
+__version__ = '0.6'
 
 if sys.version_info[:2] < (2, 7):
     py26 = True
@@ -79,23 +79,22 @@ class Page(object):
         /ajax-crawling/docs/getting-started for more information
         """
         if old_URL.find('#!') != -1:
-            esc_URL = old_URL.replace('#!', '?_escaped_fragment_=')
-            return esc_URL
+            return old_URL.replace('#!', '?_escaped_fragment_=')
+        elif old_URL.startswith('https://groups.google.com/d/topic/'):
+            # DEBUG:get_one_topic:URL collected =
+            # https://groups.google.com/d/topic/jbrout/dreCkob3KSs
+            # DEBUG:__init__:root_URL =
+            # https://groups.google.com/forum/\
+            # ?_escaped_fragment_=topic/jbrout/dreCkob3KSs
+            return old_URL.replace(
+                'https://groups.google.com/d/',
+                'https://groups.google.com/forum/?_escaped_fragment_='
+            )
         else:
             return old_URL
 
-    @classmethod
-    def do_redirect(cls, URL):
-        res = cls.opener.open(URL)
-
-        if res.getcode() == 200:
-            new_URL = res.geturl()
-            return cls.unenscape_Google_bang_URL(new_URL)
-        else:
-            raise HTTPError('Unknown URL: {0}'.format(URL))
-
     def _get_page_BS(self, URL):
-        res = self.opener.open(self.do_redirect(URL))
+        res = self.opener.open(self.unenscape_Google_bang_URL(URL))
         in_str = res.read()
         bs = BeautifulSoup(in_str)
         res.close()
@@ -134,7 +133,7 @@ class Topic(Page):
     def __init__(self, URL, name):
         super(Topic, self).__init__()
         self.name = name
-        self.root = self.do_redirect(URL)
+        self.root = self.unenscape_Google_bang_URL(URL)
         self.articles = []
 
     def __unicode__(self):
diff --git a/test/test_unit.py b/test/test_unit.py
index b286f97..e0fd0a4 100644
--- a/test/test_unit.py
+++ b/test/test_unit.py
@@ -20,10 +20,6 @@ class TestGGScrapper(unittest.TestCase):
         obs_URL = gg_scraper.Group.unenscape_Google_bang_URL(IN_URL)
         self.assertEqual(obs_URL, EXP_URL)
 
-    def test_do_redirect(self):
-        obs_URL = gg_scraper.Group.do_redirect(ORIG_URL)
-        self.assertEqual(obs_URL, EXP_URL)
-
 
 class TestMBOX(unittest.TestCase):
     def setUp(self):
```