Convert to python3.

author: Matěj Cepl <mcepl@redhat.com> 2013-12-28 00:27:09 +0100
committer: Matěj Cepl <mcepl@redhat.com> 2013-12-28 00:51:43 +0100
commit: 58ec4876d8dd638de14c2ccb3d959c40eadfe2d8 (patch)
tree: 02e70224d63e15ef137b967a55d8f24abe56bee1
parent: 9dcfa6e10d512cb767dac9c410c96072f7cbd166 (diff)
download: gg_scraper-58ec4876d8dd638de14c2ccb3d959c40eadfe2d8.tar.gz
4 files changed, 50 insertions, 44 deletions
diff --git a/bs_test.py b/bs_test.py
index 5b6a840..58b7529 100644..100755
--- a/bs_test.py
+++ b/bs_test.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 # -*- coding: utf-8 -*-
 from bs4 import BeautifulSoup
 import sys
@@ -9,6 +9,6 @@ TOPIC_COUNT_RE = re.compile(r'\D+ \d+ - \d+ \D+ (\d+) \D+$')
 bs = BeautifulSoup(open(sys.argv[1]))
 i_str = bs.find_all('i')[0].string
 
-print("i = %s" % i_str)
+print("i = {}".format(i_str))
 count = int(TOPIC_COUNT_RE.match(i_str).group(1))
-print("match i = %d" % count)
+print("match i = {0:d}".format(count))
diff --git a/gg_scrapper.py b/gg_scrapper.py
index 650810c..37f7a13 100755
--- a/gg_scrapper.py
+++ b/gg_scrapper.py
@@ -1,7 +1,9 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 
 import re
-import urllib2
+import urllib.request
+import urllib.error
+import urllib.parse
 from bs4 import BeautifulSoup
 import logging
 logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
@@ -10,24 +12,15 @@ logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
 TOPIC_COUNT_RE = re.compile(r'\D+ \d+ - \d+ \D+ (\d+) \D+$')
 
 
-class Topic(object):
-    def __init__(self, URL, name):
-        self.name = name
-        self.root = URL  # root of the discussion
-
-    def __unicode__(self):
-        return "%s: %s" % (self.root, self.name)
-
-
-class GooglePage(object):
-    verb_handler = urllib2.HTTPHandler()
+class Page(object):
+    verb_handler = urllib.request.HTTPHandler()
     if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
         verb_handler.set_http_debuglevel(2)
-    redir_handler = urllib2.HTTPRedirectHandler()
-    opener = urllib2.build_opener(verb_handler, redir_handler)
+    redir_handler = urllib.request.HTTPRedirectHandler()
+    opener = urllib.request.build_opener(verb_handler, redir_handler)
 
-    def __init__(self, URL):
-        self.bs_page = self.get_first_page_BS(URL)
+    def __init__(self):
+        pass
 
     @staticmethod
     def unenscape_Google_bang_URL(old_URL):
@@ -51,15 +44,40 @@ class GooglePage(object):
             logging.debug('url = {}'.format(new_URL))
             return cls.unenscape_Google_bang_URL(new_URL)
         else:
-            raise urllib2.HTTPError('Unknown URL: {}'.format(URL))
+            raise urllib.error.HTTPError('Unknown URL: {}'.format(URL))
 
-    def get_first_page_BS(self, URL):
+    def _get_page_BS(self, URL):
         res = self.opener.open(self.do_redirect(URL))
         in_str = res.read()
         bs = BeautifulSoup(in_str)
         res.close()
         return bs
 
+
+class Article(Page):
+    def __init__(self):
+        super(Article, self).__init__()
+
+
+class Topic(Page):
+    def __init__(self, URL, name):
+        super(Topic, self).__init__()
+        self.name = name
+        self.root = URL
+
+    def __unicode__(self):
+        return "%s: %s" % (self.root, self.name)
+
+    def get_articles(self):
+        page = self._get_page_BS(self.root)
+        page = page
+
+
+class Group(Page):
+    def __init__(self, URL):
+        super(Group, self).__init__()
+        self.group_URL = URL
+
     def get_count_topics(self, BS):
         '''Get total number of topics from the number on the page
         itself.
@@ -74,13 +92,14 @@ class GooglePage(object):
         i_str = i_elem[0].string
         return int(TOPIC_COUNT_RE.match(i_str).group(1))
 
-    def get_topics(self, BS):
+    def get_topics(self):
         '''Recursively[?] get all topic (as special objects)
         Also return (for error checking) number of topics from the head
         of the topic page.
         '''
         out = []
         other = []
+        BS = self._get_page_BS(self.group_URL)
         for a_elem in BS.find_all('a'):
             if 'title' in a_elem.attrs:
                 # filter out all-non-topic <a>s
@@ -92,8 +111,8 @@ class GooglePage(object):
                 other.append(a_elem)
 
         if len(other) == 1:
-            new_bs = BeautifulSoup(self.opener.open(other[0]['href']).read())
-            out.extend(self.get_topics(new_bs))
+            new_bs = Group(other[0]['href'])
+            out.extend(new_bs.get_topics())
         elif len(other) != 0:
             raise ValueError(
                 'There must be either one or none link to the next page!')
diff --git a/test-log.txt b/test-log.txt
index 56a123b..980cfce 100644
--- a/test-log.txt
+++ b/test-log.txt
@@ -1128,19 +1128,6 @@ DEBUG:unenscape_Google_bang_URL:esc_URL = https://groups.google.com/forum/?_esca
 ok
 
 ----------------------------------------------------------------------
-Ran 3 tests in 36.116s
+Ran 3 tests in 31.608s
 
 OK
-send: u'GET /d/forum/jbrout HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: groups.google.com\r\nConnection: close\r\nUser-Agent: Python-urllib/2.7\r\n\r\n'
-reply: 'HTTP/1.1 302 Moved Temporarily\r\n'
-header: Location: https://groups.google.com/d/forum/jbrout
-header: Content-Type: text/html; charset=UTF-8
-header: Date: Fri, 27 Dec 2013 03:31:46 GMT
-header: Expires: Fri, 27 Dec 2013 03:31:46 GMT
-header: Cache-Control: private, max-age=0
-header: X-Content-Type-Options: nosniff
-header: X-Frame-Options: SAMEORIGIN
-header: X-XSS-Protection: 1; mode=block
-header: Server: GSE
-header: Alternate-Protocol: 80:quic
-header: Connection: close
diff --git a/test_gg_scrapper.py b/test_gg_scrapper.py
index 1a75aab..10e7f4a 100644
--- a/test_gg_scrapper.py
+++ b/test_gg_scrapper.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-from __future__ import unicode_literals, print_function, absolute_import
+
 import logging
 import unittest
 import gg_scrapper
@@ -12,16 +12,16 @@ EXP_URL = 'https://groups.google.com/forum/' + \
 
 class TestGGScrapper(unittest.TestCase):
     def test_URL_conversion(self):
-        obs_URL = gg_scrapper.GooglePage.unenscape_Google_bang_URL(IN_URL)
+        obs_URL = gg_scrapper.Group.unenscape_Google_bang_URL(IN_URL)
         self.assertEqual(obs_URL, EXP_URL)
 
     def test_do_redirect(self):
-        obs_URL = gg_scrapper.GooglePage.do_redirect(ORIG_URL)
+        obs_URL = gg_scrapper.Group.do_redirect(ORIG_URL)
         self.assertEqual(obs_URL, EXP_URL)
 
     def test_collecting_topics(self):
-        page = gg_scrapper.GooglePage(IN_URL)
-        topics = page.get_topics(page.bs_page)
+        page = gg_scrapper.Group(IN_URL)
+        topics = page.get_topics()
         logging.debug("number of topics = %d", len(topics))
         self.assertGreater(len(topics), 0)
author	Matěj Cepl <mcepl@redhat.com>	2013-12-28 00:27:09 +0100
committer	Matěj Cepl <mcepl@redhat.com>	2013-12-28 00:51:43 +0100
commit	58ec4876d8dd638de14c2ccb3d959c40eadfe2d8 (patch)
tree	02e70224d63e15ef137b967a55d8f24abe56bee1
parent	9dcfa6e10d512cb767dac9c410c96072f7cbd166 (diff)
download	gg_scraper-58ec4876d8dd638de14c2ccb3d959c40eadfe2d8.tar.gz