aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMatěj Cepl <mcepl@redhat.com>2013-12-30 01:11:17 +0100
committerMatěj Cepl <mcepl@redhat.com>2013-12-30 01:11:17 +0100
commit6ce68fd69aa0403766bac31c85be6bb4a3a286cc (patch)
tree2e309158b5d86c5689ed99d04024eeced4367f89
parentdfbb929e6b9985810646a19850268c382820791e (diff)
downloadgg_scraper-6ce68fd69aa0403766bac31c85be6bb4a3a286cc.tar.gz
Collect raw article
-rwxr-xr-xgg_scrapper.py14
-rw-r--r--test/message.eml95
-rw-r--r--test/test_functional.py17
3 files changed, 125 insertions, 1 deletions
diff --git a/gg_scrapper.py b/gg_scrapper.py
index f516892..13da91d 100755
--- a/gg_scrapper.py
+++ b/gg_scrapper.py
@@ -1,6 +1,7 @@
#!/usr/bin/python3
import re
+import subprocess
import urllib.request
import urllib.error
import urllib.parse
@@ -60,7 +61,17 @@ class Page(object):
class Article(Page):
def __init__(self, URL):
super(Article, self).__init__()
- self.root = URL
+ self.root = URL.replace('#!msg/', 'message/raw?msg=')
+ self.raw_message = ''
+
+ def collect_message(self):
+ with self.opener.open(self.root) as res:
+ raw_msg = res.read()
+ proc = subprocess.Popen(['/usr/bin/formail'],
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE)
+ result = proc.communicate(raw_msg)[0]
+ return result.decode()
class Topic(Page):
@@ -68,6 +79,7 @@ class Topic(Page):
super(Topic, self).__init__()
self.name = name
self.root = self.do_redirect(URL)
+ self.articles = []
def __unicode__(self):
return "%s: %s" % (self.root, self.name)
diff --git a/test/message.eml b/test/message.eml
new file mode 100644
index 0000000..5466b6f
--- /dev/null
+++ b/test/message.eml
@@ -0,0 +1,95 @@
+Received: by 10.52.108.162 with SMTP id hl2mr2540708vdb.27.1304342042063;
+ Mon, 02 May 2011 06:14:02 -0700 (PDT)
+X-BeenThere: jbrout@googlegroups.com
+Received: by 10.52.91.165 with SMTP id cf5ls1513548vdb.1.gmail; Mon, 02 May
+ 2011 06:14:01 -0700 (PDT)
+Received: by 10.52.115.164 with SMTP id jp4mr2877373vdb.20.1304342041446;
+ Mon, 02 May 2011 06:14:01 -0700 (PDT)
+Received: by 10.52.115.164 with SMTP id jp4mr2877372vdb.20.1304342041432;
+ Mon, 02 May 2011 06:14:01 -0700 (PDT)
+Return-Path: <chartier...@gmail.com>
+Received: from mail-qy0-f174.google.com (mail-qy0-f174.google.com [209.85.216.174])
+ by gmr-mx.google.com with ESMTPS id dq1si27474vdb.4.2011.05.02.06.14.01
+ (version=TLSv1/SSLv3 cipher=OTHER);
+ Mon, 02 May 2011 06:14:01 -0700 (PDT)
+Received-SPF: pass (google.com: domain of chartier...@gmail.com designates 209.85.216.174 as permitted sender) client-ip=209.85.216.174;
+Authentication-Results: gmr-mx.google.com; spf=pass (google.com: domain of chartier...@gmail.com designates 209.85.216.174 as permitted sender) smtp.mail=chartier...@gmail.com; dkim=pass (test mode) head...@gmail.com
+Received: by qyk7 with SMTP id 7so1309128qyk.19
+ for <jbr...@googlegroups.com>; Mon, 02 May 2011 06:14:01 -0700 (PDT)
+DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
+ d=gmail.com; s=gamma;
+ h=domainkey-signature:mime-version:date:message-id:subject:from:to
+ :content-type;
+ bh=Nalzo4HWmw8csZHQI1EnJTSkR0XU4A34QoJmVmekz6U=;
+ b=M7NzjgATs/y0xZ12nA21o5uWlaqi8gGrd4giou2t4kDTDQ2YNaBf2ziHdrWz8UbkQ5
+ dLTYaYVWCndQMukVsIZiC7d+vOGvW/+gigEDT3Fe5yyugnJmmvsnBA5KBF6S0wejNZJo
+ ifmkRWW8eTTcMeRavBsy9l8XIkPSxxKY29aPA=
+DomainKey-Signature: a=rsa-sha1; c=nofws;
+ d=gmail.com; s=gamma;
+ h=mime-version:date:message-id:subject:from:to:content-type;
+ b=V18crP15nGTVRRJzAzpgXM79+GV+wo4V4sbl+wDdtS0FZ84QT3XeUGPgn8N66v3FZa
+ rJDmZbR1TfHzhh42xAuL0o9pgkNqUkjEnsHdCi5WMKckCE1azmyvCeV4e7rgE3+dfAVM
+ attd9s30gSz4l7D1sBR+CWyPNovlHgJ4rFcNU=
+MIME-Version: 1.0
+Received: by 10.229.66.25 with SMTP id l25mr6066398qci.265.1304342041056; Mon,
+ 02 May 2011 06:14:01 -0700 (PDT)
+Received: by 10.229.34.145 with HTTP; Mon, 2 May 2011 06:14:01 -0700 (PDT)
+Date: Mon, 2 May 2011 15:14:01 +0200
+Message-ID: <BANLkTikms4MWTtUQE+pJEr2oAro_1FCfLg@mail.gmail.com>
+Subject: repo version incompatible with ubuntu 11.04 ?
+From: Francois Chartier <chartier...@gmail.com>
+To: jbrout@googlegroups.com
+Content-Type: multipart/alternative; boundary=0016e64ea45405d6cf04a24acc7a
+
+
+--0016e64ea45405d6cf04a24acc7a
+Content-Type: text/plain; charset=ISO-8859-1
+Content-Transfer-Encoding: quoted-printable
+
+Read today on http://doc.ubuntu-fr.org/jbrout : apparently, the repository
+version of jbrout has some compatibility issue with ubuntu 11.04:
+(But I haven't tried 11.04 myself yet, and am using SVN version, so... I'm
+just relaying it for info):
+
+29/04/2011 - Ubuntu 11.04 (natty) : la version jBrout pr=E9sente dans le d=
+=E9p=F4t
+externe (0.3.284) n'est pas compatible avec Ubuntu 11.04. Message d'erreur =
+:
+
+
+ Photo has incorrect exif/iptc tags, can't be imported
+
+
+
+The author of the page also complains that jbrout uses 100% of his CPU...
+
+
+Best regards
+Francois
+
+--0016e64ea45405d6cf04a24acc7a
+Content-Type: text/html; charset=ISO-8859-1
+Content-Transfer-Encoding: quoted-printable
+
+Read today on <a href=3D"http://doc.ubuntu-fr.org/jbrout">http://doc.ubuntu=
+-fr.org/jbrout</a> : apparently, the repository version of jbrout has some =
+compatibility issue with ubuntu 11.04: <br>(But I haven&#39;t tried 11.04 m=
+yself yet, and am using SVN version, so... I&#39;m just relaying it for inf=
+o): <br>
+<br><p>
+
+29/04/2011 - Ubuntu 11.04 (natty) : la version jBrout pr=E9sente dans le=20
+d=E9p=F4t externe (0.3.284) n&#39;est pas compatible avec Ubuntu 11.04.
+Message d&#39;erreur :
+
+</p>
+<pre class=3D"code"> Photo has incorrect exif/iptc tags, can&#39;t be im=
+ported<br><br><br><br></pre>The author of the page also complains that jbro=
+ut uses 100% of his CPU... <br><br><br>Best regards <br>Francois<br>
+<br>
+<p>
+
+<br></p>
+
+--0016e64ea45405d6cf04a24acc7a--
+
diff --git a/test/test_functional.py b/test/test_functional.py
index cb5f2e0..3f657f6 100644
--- a/test/test_functional.py
+++ b/test/test_functional.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import logging
+import os.path
import unittest
import gg_scrapper
@@ -9,6 +10,8 @@ ORIG_URL = 'http://groups.google.com/d/forum/jbrout'
EXP_URL = 'https://groups.google.com/forum/' + \
'?_escaped_fragment_=forum/jbrout'
TOPIC_URL = 'https://groups.google.com/forum/#!topic/jbrout/xNwoVmC07KI'
+ARTICLE_URL = 'https://groups.google.com/forum/#!msg/jbrout' + \
+ '/xNwoVmC07KI/OfpRHFscUkwJ'
class TestGGScrapperFunctional(unittest.TestCase):
@@ -29,5 +32,19 @@ class TestGGScrapperFunctional(unittest.TestCase):
logging.debug('articles = len {0:d}'.format(len(articles)))
self.assertEqual(len(articles), article_count)
+ def test_get_raw_article(self):
+ self.maxDiff = None
+ logging.debug('article = URL {}'.format(ARTICLE_URL))
+ article = gg_scrapper.Article(ARTICLE_URL)
+ logging.debug('article = raw URL {}'.format(article.root))
+
+ rfc_msg = article.collect_message().replace('\r\n', '\n')
+ rfc_msg = '\n'.join(rfc_msg.split('\n')[1:])
+
+ exp_file_name = os.path.join(os.path.dirname(__file__), 'message.eml')
+ with open(exp_file_name, 'r', encoding='utf8') as exp_f:
+ self.assertEqual(rfc_msg, exp_f.read())
+
+
if __name__ == '__main__':
unittest.main()