about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Matěj Cepl <mcepl@redhat.com> 2014-01-11 11:06:05 +0100
committer: Matěj Cepl <mcepl@redhat.com> 2014-01-11 11:31:46 +0100
commit: 2c734aeadf108363e570f59b4a69adbe14835171 (patch)
tree: f59903beca49ddd4515b8c00fe904eda73033643
parent: 9a43d17636e6f77eb93d5c4d3653f36b3348851a (diff)
download: gg_scraper-0.6.tar.gz
Eliminate do_redirect() method. (tag: 0.6)
One less HTTP connection, which was actually not needed.
-rwxr-xr-x gg_scraper.py 29
-rw-r--r-- test/test_unit.py 4
2 files changed, 14 insertions, 19 deletions
diff --git a/gg_scraper.py b/gg_scraper.py
index 5656fa4..563229f 100755
--- a/gg_scraper.py
+++ b/gg_scraper.py
@@ -54,7 +54,7 @@ MANGLED_ADDR_RE = re.compile(
r'([a-zA-Z0-9_.+-]+\.\.\.@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)',
re.IGNORECASE)
-__version__ = '0.5'
+__version__ = '0.6'
if sys.version_info[:2] < (2, 7):
py26 = True
@@ -79,23 +79,22 @@ class Page(object):
/ajax-crawling/docs/getting-started for more information
"""
if old_URL.find('#!') != -1:
- esc_URL = old_URL.replace('#!', '?_escaped_fragment_=')
- return esc_URL
+ return old_URL.replace('#!', '?_escaped_fragment_=')
+ elif old_URL.startswith('https://groups.google.com/d/topic/'):
+ # DEBUG:get_one_topic:URL collected =
+ # https://groups.google.com/d/topic/jbrout/dreCkob3KSs
+ # DEBUG:__init__:root_URL =
+ # https://groups.google.com/forum/\
+ # ?_escaped_fragment_=topic/jbrout/dreCkob3KSs
+ return old_URL.replace(
+ 'https://groups.google.com/d/',
+ 'https://groups.google.com/forum/?_escaped_fragment_='
+ )
else:
return old_URL
- @classmethod
- def do_redirect(cls, URL):
- res = cls.opener.open(URL)
-
- if res.getcode() == 200:
- new_URL = res.geturl()
- return cls.unenscape_Google_bang_URL(new_URL)
- else:
- raise HTTPError('Unknown URL: {0}'.format(URL))
-
def _get_page_BS(self, URL):
- res = self.opener.open(self.do_redirect(URL))
+ res = self.opener.open(self.unenscape_Google_bang_URL(URL))
in_str = res.read()
bs = BeautifulSoup(in_str)
res.close()
@@ -134,7 +133,7 @@ class Topic(Page):
def __init__(self, URL, name):
super(Topic, self).__init__()
self.name = name
- self.root = self.do_redirect(URL)
+ self.root = self.unenscape_Google_bang_URL(URL)
self.articles = []
def __unicode__(self):
diff --git a/test/test_unit.py b/test/test_unit.py
index b286f97..e0fd0a4 100644
--- a/test/test_unit.py
+++ b/test/test_unit.py
@@ -20,10 +20,6 @@ class TestGGScrapper(unittest.TestCase):
obs_URL = gg_scraper.Group.unenscape_Google_bang_URL(IN_URL)
self.assertEqual(obs_URL, EXP_URL)
- def test_do_redirect(self):
- obs_URL = gg_scraper.Group.do_redirect(ORIG_URL)
- self.assertEqual(obs_URL, EXP_URL)
-
class TestMBOX(unittest.TestCase):
def setUp(self):