diff options
-rw-r--r-- | parse_received.py | 490 | ||||
-rw-r--r-- | test/samples/testing-email-01.eml | 129 | ||||
-rw-r--r-- | test/test_parse_received.py | 157 |
3 files changed, 776 insertions, 0 deletions
# -*- coding: utf-8 -*-
"""Lexer, grammar and helpers for parsing e-mail ``Received:`` headers.

The module builds an rply lexer (``lexer``) and an LALR parser (``parser``)
for the "time stamp" line described in RFC 2821 section 4.4 and exposes
``parse_header()``, which turns one header value into a plain dictionary.

Shape of the tree returned by ``parser.parse()`` for::

    from d1080.master.cz (p-lab.cz [89.185.245.149] (may be forged))
        by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id t07GaC1j031854
        for <mcepl@redhat.com>; Wed, 7 Jan 2015 11:36:13 -0500

is::

    [[Token('FROMSEP', 'from'),
      [Token('DOMAIN', 'd1080.master.cz'),
       [Token('DOMAIN', 'p-lab.cz'),
        Token('IPV4ADDRESS', '[89.185.245.149]')]]],
     [Token('BYSEP', 'by'),
      [Token('DOMAIN', 'mx1.redhat.com'),
       [Token('SMTPVERSION', '8.14.4/8.14.4')]]],
     [[[],                                    # via
       [Token('SMTPSEP', 'ESMTP')],           # with
       [Token('STRING', 't07GaC1j031854')],   # id
       [Token('FORSEP', 'for')]]],            # for
     Token('DATETIME', 'Wed, 7 Jan 2015 11:36:13 -0500')]
"""
import email
import email.utils
import logging
from pprint import pformat

import dateutil.parser
import rply

# NOTE(review): configuring the root logger at import time is a side effect
# a library module normally should not have; it is kept because existing
# users of this module may rely on the debug output being switched on.
logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
                    level=logging.DEBUG)
logger = logging.getLogger(__name__)

# from http://search.cpan.org/~simon/Email-Received-1.00/lib/Email/Received.pm
# possible keys are
# ip rdns helo ident envfrom auth by id

# from RFC2821
"""
4.4 Trace Information


   When an SMTP server receives a message for delivery or further
   processing, it MUST insert trace ("time stamp" or "Received")
   information at the beginning of the message content, as discussed in
   section 4.1.1.4.

   This line MUST be structured as follows:

   - The FROM field, which MUST be supplied in an SMTP environment,
     SHOULD contain both (1) the name of the source host as presented
     in the EHLO command and (2) an address literal containing the IP
     address of the source, determined from the TCP connection.

   - The ID field MAY contain an "@" as suggested in RFC 822, but this
     is not required.

   - The FOR field MAY contain a list of <path> entries when multiple
     RCPT commands have been given. This may raise some security
     issues and is usually not desirable; see section 7.2.

   An Internet mail program MUST NOT change a Received: line that was
   previously added to the message header. SMTP servers MUST prepend
   Received lines to messages; they MUST NOT change the order of
   existing lines or insert Received lines in any other location.

   As the Internet grows, comparability of Received fields is important
   for detecting problems, especially slow relays. SMTP servers that
   create Received fields SHOULD use explicit offsets in the dates
   (e.g., -0800), rather than time zone names of any type. Local time
   (with an offset) is preferred to UT when feasible. This formulation
   allows slightly more information about local circumstances to be
   specified. If UT is needed, the receiver need merely do some simple
   arithmetic to convert the values. Use of UT loses information about
   the time zone-location of the server. If it is desired to supply a
   time zone name, it SHOULD be included in a comment.

   When the delivery SMTP server makes the "final delivery" of a
   message, it inserts a return-path line at the beginning of the mail
   data. This use of return-path is required; mail systems MUST support
   it. The return-path line preserves the information in the <reverse-
   path> from the MAIL command. Here, final delivery means the message
   has left the SMTP environment. Normally, this would mean it had been
   delivered to the destination user or an associated mail drop, but in
   some cases it may be further processed and transmitted by another
   mail system.

   It is possible for the mailbox in the return path to be different
   from the actual sender's mailbox, for example, if error responses are
   to be delivered to a special error handling mailbox rather than to
   the message sender. When mailing lists are involved, this
   arrangement is common and useful as a means of directing errors to
   the list maintainer rather than the message originator.

   The text above implies that the final mail data will begin with a
   return path line, followed by one or more time stamp lines. These
   lines will be followed by the mail data headers and body [32].

   It is sometimes difficult for an SMTP server to determine whether or
   not it is making final delivery since forwarding or other operations
   may occur after the message is accepted for delivery. Consequently,
   any further (forwarding, gateway, or relay) systems MAY remove the
   return path and rebuild the MAIL command as needed to ensure that
   exactly one such line appears in a delivered message.

   A message-originating SMTP system SHOULD NOT send a message that
   already contains a Return-path header. SMTP servers performing a
   relay function MUST NOT inspect the message data, and especially not
   to the extent needed to determine if Return-path headers are present.
   SMTP servers making final delivery MAY remove Return-path headers
   before adding their own.

   The primary purpose of the Return-path is to designate the address to
   which messages indicating non-delivery or other mail system failures
   are to be sent. For this to be unambiguous, exactly one return path
   SHOULD be present when the message is delivered. Systems using RFC
   822 syntax with non-SMTP transports SHOULD designate an unambiguous
   address, associated with the transport envelope, to which error
   reports (e.g., non-delivery messages) should be sent.

   Historical note: Text in RFC 822 that appears to contradict the use
   of the Return-path header (or the envelope reverse path address from
   the MAIL command) as the destination for error messages is not
   applicable on the Internet. The reverse path address (as copied into
   the Return-path) MUST be used as the target of any mail containing
   delivery error messages.

   In particular:

   - a gateway from SMTP->elsewhere SHOULD insert a return-path header,
     unless it is known that the "elsewhere" transport also uses
     Internet domain addresses and maintains the envelope sender
     address separately.

   - a gateway from elsewhere->SMTP SHOULD delete any return-path
     header present in the message, and either copy that information to
     the SMTP envelope or combine it with information present in the
     envelope of the other transport system to construct the reverse
     path argument to the MAIL command in the SMTP envelope.

   The server must give special treatment to cases in which the
   processing following the end of mail data indication is only
   partially successful. This could happen if, after accepting several
   recipients and the mail data, the SMTP server finds that the mail
   data could be successfully delivered to some, but not all, of the
   recipients. In such cases, the response to the DATA command MUST be
   an OK reply. However, the SMTP server MUST compose and send an
   "undeliverable mail" notification message to the originator of the
   message.

   A single notification listing all of the failed recipients or
   separate notification messages MUST be sent for each failed
   recipient. For economy of processing by the sender, the former is
   preferred when possible. All undeliverable mail notification
   messages are sent using the MAIL command (even if they result from
   processing the obsolete SEND, SOML, or SAML commands) and use a null
   return path as discussed in section 3.7.

   The time stamp line and the return path line are formally defined as
   follows:
"""

# from RFC 5536 (Netnews Article Format)

"""
3.1.5. Path


   The Path header field indicates the route taken by an article since
   its injection into the Netnews system. Each agent that processes an
   article is required to prepend at least one <path-identity> to this
   header field body. This is primarily so that news servers are able
   to avoid sending articles to sites already known to have them, in
   particular the site they came from. Additionally, it permits
   gathering statistics and tracing the route articles take in moving
   over the network.

   path = "Path:" SP *WSP path-list tail-entry *WSP CRLF

   path-list = *( path-identity [FWS] [path-diagnostic] "!" )

   path-diagnostic = diag-match / diag-other / diag-deprecated

   diag-match = "!" ; another "!"

   diag-other = "!." diag-keyword [ "." diag-identity ] [FWS]

   diag-deprecated = "!" IPv4address [FWS]

   diag-keyword = 1*ALPHA ; see [RFC5537]

   diag-identity = path-identity / IPv4address / IPv6address

   tail-entry = path-nodot
                ; may be the string "not-for-mail"

   path-identity = ( 1*( label "." ) toplabel ) / path-nodot

   path-nodot = 1*( alphanum / "-" / "_" ) ; legacy names

   label = alphanum [ *( alphanum / "-" ) alphanum ]

   toplabel = ( [ label *( "-" ) ] ALPHA *( "-" ) label ) /
              ( label *( "-" ) ALPHA [ *( "-" ) label ] ) /
              ( label 1*( "-" ) label )

   alphanum = ALPHA / DIGIT ; compare [RFC3696]

   A <path-identity> is a name identifying a site. It takes the form of
   a domain name having two or more components separated by dots, or a
   single name with no dots (<path-nodot>).

   Each <path-identity> in the <path-list> (which does not include the
   <tail-entry>) indicates, from right to left, the successive agents
   through which the article has passed. The use of the <diag-match>,
   which appears as "!!", indicates that the agent to its left verified
   the identity of the agent to its right before accepting the article
   (whereas the <path-delimiter> "!" implies no such claim).

      NOTE: Historically, the <tail-entry> indicated the name of the
      sender. If not used for this purpose, the string "not-for-mail"
      is often used instead (since at one time the whole path could be
      used as a mail address for the sender).

      NOTE: Although case-insensitive, it is intended that the
      <diag-keyword>s should be in uppercase, to distinguish them from
      the <path-identity>s, which are traditionally in lowercase.

   A <path-diagnostic> is an item inserted into the Path header field
   for purposes other than to indicate the name of a site. The use of
   these is described in [RFC5537].

      NOTE: One usage of a <path-diagnostic> is to record an IP address.
      The fact that <IPv6address>es are allowed means that the colon (:)
      is permitted; note that this may cause interoperability problems
      at older sites that regard ":" as a <path-delimiter> and have
      neighbors whose names have 4 or fewer characters, and where all
      the characters are valid HEX digits.

      NOTE: Although <IPv4address>es have occasionally been used in the
      past (usually with a diagnostic intent), their continued use is
      deprecated (though it is still acceptable in the form of the
      <diag-deprecated>).
"""


# ---------------------------------------------------------------------------
# Lexer
# ---------------------------------------------------------------------------
# ``add`` takes a rule name and the regular expression defining the rule.
# Rule order matters: the first rule that matches at a position wins, so the
# keyword rules come first and the catch-all STRING comes last.
_lg = rply.LexerGenerator()
_lg.ignore(r"\s+")
_lg.ignore(r'[\n;=\)\(]+')
_lg.add('FROMSEP', r"(?i)\bfrom\b")
_lg.add('BYSEP', r"(?i)\bby\b")
_lg.add('VIASEP', r"(?i)\bvia\b")
_lg.add('WITHSEP', r"(?i)\bwith\b")
_lg.add('FORSEP', r"(?i)\bfor\b")
_lg.add('IDSEP', r"(?i)\bid\b")
_lg.add('TCPSEP', r"(?i)\btcp\b")
_lg.add('SMTPSEP', r"(?i)\b[e]?smtp\b")
_lg.add('IPV4ADDRESS', r"\[?\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\]?")
_lg.add('SMTPVERSION', r"\d{1,2}\.\d{1,2}\.\d{1,2}/\d{1,2}\.\d{1,2}\.\d{1,2}")
_lg.add('DOMAIN', r"(?i)[<(]?(([\w][\w\-\.]*)\.)?([\w][\w\-]+)*" +
        r"(\.([a-z][a-z]*))[>)]?")
_WEEKDAY = r"(Mon|Tue|Wed|Thu|Fri|Sat|Sun)"
_MONTH_NAME = r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"
_lg.add('DATETIME', (r"%s\s*,\s+\d{1,2}\s+%s\s+\d{4} " +
                     r"\d{1,2}:\d{2}:\d{2}\s+[-+]\d{4}(\s*\([A-Z]{3}\))?") %
        (_WEEKDAY, _MONTH_NAME))
_lg.add("EMAILADDR", r'<[a-zA-Z0-9_.+-]+@[a-zA-Z0-9._-]+>')
_lg.add('IGNORABLESTR', r"may be forged")
_lg.add('STRING', r"\S+")

lexer = _lg.build()

# ---------------------------------------------------------------------------
# Grammar
# ---------------------------------------------------------------------------
_pg = rply.ParserGenerator([rule.name for rule in lexer.rules],
                           cache_id='received_parser')


@_pg.production('main : stamp')
def main(p):
    """Return the tree of the single ``stamp`` that makes up a header."""
    return p[0]


@_pg.production('stamp : from-domain by-domain optinfo DATETIME')
def stamp(p):
    """Return ``[from-domain, by-domain, optinfo, DATETIME token]``."""
    return [p[0], p[1], p[2], p[3]]


@_pg.production('from-domain : FROMSEP extended-domain')
def from_domain(p):
    """Return ``[FROMSEP token, extended-domain]``."""
    return [p[0], p[1]]


@_pg.production('by-domain : BYSEP extended-domain')
def by_domain(p):
    """Return ``[BYSEP token, extended-domain]``."""
    return [p[0], p[1]]


@_pg.production('extended-domain : DOMAIN')
def extended_domain(p):
    """Return the bare DOMAIN token (no parenthesised info followed it)."""
    return p[0]


@_pg.production('extended-domain : DOMAIN tcp-info')
def extended_domain_tcp(p):
    """Return ``[DOMAIN token, tcp-info list]``."""
    return [p[0], p[1]]


@_pg.production('extended-domain : IPV4ADDRESS tcp-info')
def extended_domain_addr(p):
    """Return ``[IPV4ADDRESS token, tcp-info list]``."""
    return [p[0], p[1]]


@_pg.production('extended-domain : extended-domain IGNORABLESTR')
def extended_domain_ignorable(p):
    """Drop a trailing "may be forged" remark and keep the domain part."""
    return p[0]


@_pg.production('tcp-info : IPV4ADDRESS')
def tcp_info(p):
    """Return ``[IPV4ADDRESS token]``."""
    return p


@_pg.production('tcp-info : DOMAIN IPV4ADDRESS')
def tcp_info_addr(p):
    """Return ``[DOMAIN token, IPV4ADDRESS token]``."""
    return p


@_pg.production('tcp-info : SMTPVERSION')
def tcp_info_string(p):
    """Return ``[SMTPVERSION token]``."""
    return p


@_pg.production('optinfo : via with id for')
def optinfo(p):
    """Return ``[[via, with, id, for]]``; absent clauses are empty lists."""
    return [p]


@_pg.production('via : ')
def via_empty(p):
    """Return ``[]`` for a header without a ``via`` clause."""
    return p


@_pg.production('via : VIASEP link')
def via(p):
    """Return ``[link]``.

    The rule has two symbols (VIASEP, link), so the link is ``p[1]``; the
    previous ``p[2]`` raised IndexError whenever a ``via`` clause was parsed.
    """
    return [p[1]]


@_pg.production('with :')
def with_empty(p):
    """Return ``[]`` for a header without a ``with`` clause."""
    return p


@_pg.production('with : WITHSEP protocol')
def with_protocol(p):
    """Return the protocol list produced by the ``protocol`` rule."""
    return p[1]


@_pg.production('id : ')
def id_empty(p):
    """Return ``[]`` for a header without an ``id`` clause."""
    return p


@_pg.production('id : IDSEP STRING')
def id_string(p):
    """Return ``[STRING token]`` holding the queue id.

    Named ``id_string`` rather than ``id`` so the builtin ``id()`` is not
    shadowed at module level; rply registers the callback through the
    decorator, not through its name.
    """
    return [p[1]]


@_pg.production('id : IDSEP EMAILADDR')
def id_addr(p):
    """Return ``[EMAILADDR token]``; here the token is really a Message-Id."""
    return [p[1]]


@_pg.production('for : ')
def for_empty(p):
    """Return ``[]`` for a header without a ``for`` clause."""
    return p


# For = "FOR" FWS 1*( Path / Mailbox ) CFWS
@_pg.production('for : FORSEP EMAILADDR')
def optinfo_for(p):
    """Return ``[FORSEP token]``.

    NOTE(review): this keeps the keyword and discards the recipient address
    (``p[1]``).  That looks unintended, but the parse tree is asserted by the
    existing unit tests, so the shape is left unchanged here.
    """
    return [p[0]]


@_pg.production('link : TCPSEP STRING')
def link(p):
    """Return ``[STRING token]`` following the ``tcp`` keyword.

    The rule has two symbols (TCPSEP, STRING), so the value is ``p[1]``; the
    previous ``p[2]`` raised IndexError.
    """
    return [p[1]]


@_pg.production('protocol : SMTPSEP')
def protocol(p):
    """Return ``[SMTPSEP token]``."""
    return p


parser = _pg.build()


# from http://stackoverflow.com/questions/27450365/
def sort_key(received_header):
    """Return a date tuple usable for ordering ``Received:`` header values.

    The date-time of a Received header follows the last ``;``, so only that
    part is handed to ``email.utils.parsedate_tz`` (a whole header would make
    it return None).  A string without ``;`` is parsed as a whole, as before.

    :param received_header: header value, or just its date part
    :returns: the 10-tuple from ``parsedate_tz``, or None if no date is found
    """
    date_part = received_header.rpartition(';')[2]
    return email.utils.parsedate_tz(date_part)


def _split_extended_domain(node):
    """Return ``(host_token, info_tokens)`` for an ``extended-domain`` node.

    The grammar yields either a bare token or ``[token, tcp-info list]``.
    """
    if isinstance(node, list):
        return node[0], node[1]
    return node, []


def _first_token_str(tokens, token_type):
    """Return the text of the first token of *token_type*, or None."""
    for tok in tokens:
        if tok.gettokentype() == token_type:
            return tok.getstr()
    return None


def parse_header(in_str):
    """Parse one ``Received:`` header value into a dictionary.

    :param in_str: header value without the ``Received:`` prefix
    :returns: ``{'from': {'halo', 'reveresed', 'ipaddr'},
              'by': {'server'}, 'date': datetime}``; ``reveresed`` and
              ``ipaddr`` are None when the header does not carry them
    :raises rply.ParsingError: if the header does not match the grammar
    """
    # Path: cbosgd!mhuxj!mhuxt!eagle!jerry
    # Dumping the tokens exhausts a stream, so lex a separate one for the log
    # instead of rewinding the one handed to the parser.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug('\nstream:\n%s', list(lexer.lex(in_str)))
    parsed = parser.parse(lexer.lex(in_str))
    logger.debug('\nparsed:\n%s', pformat(parsed))

    from_host, from_info = _split_extended_domain(parsed[0][1])
    logger.debug('\nfrom_list = %s', from_info)
    by_list = parsed[1][1]
    by_host, _by_info = _split_extended_domain(by_list)
    logger.debug('by_list = %s', by_list)
    with_list = parsed[2]
    logger.debug('with_list = %s', with_list)
    date_str = parsed[3].getstr()
    logger.debug('date_str = %s', date_str)

    # NOTE(review): 'halo' and 'reveresed' look like misspellings of "helo"
    # and "reversed"; they are kept because callers and tests use these keys.
    return {
        'from': {
            'halo': from_host.getstr(),
            'reveresed': _first_token_str(from_info, 'DOMAIN'),
            'ipaddr': _first_token_str(from_info, 'IPV4ADDRESS'),
        },
        'by': {'server': by_host.getstr()},
        'date': dateutil.parser.parse(date_str),
    }


# Placeholder kept so the module attribute stays available; it is meant to
# hold header values that can be ordered with ``sort(key=sort_key)``.
received_header_list = []

if __name__ == '__main__':
    instr = """from d1080.master.cz (p-lab.cz [89.185.245.149] (may be forged))
        by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id t07GaC1j031854
        for <mcepl@redhat.com>; Wed, 7 Jan 2015 11:36:13 -0500"""
    print(instr)
    for token in lexer.lex(instr):
        print(token)
(EST) +Received: from mx1.redhat.com (ext-mx12.extmail.prod.ext.phx2.redhat.com [10.5.110.17]) + by int-mx14.intmail.prod.int.phx2.redhat.com (8.14.4/8.14.4) with ESMTP id t07GaEPV007184 + for <mcepl@redhat.com>; Wed, 7 Jan 2015 11:36:14 -0500 +Received: from d1080.master.cz (p-lab.cz [89.185.245.149] (may be forged)) + by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id t07GaC1j031854 + for <mcepl@redhat.com>; Wed, 7 Jan 2015 11:36:13 -0500 +Received: by d1080.master.cz (Postfix) + id 1932D3218192; Wed, 7 Jan 2015 17:36:12 +0100 (CET) +Delivered-To: virtuser_566@d1080.master.cz +Received: by d1080.master.cz (Postfix, from userid 557) + id 113733218161; Wed, 7 Jan 2015 17:36:12 +0100 (CET) +X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on p-lab.cz +X-Spam-Level: +X-Spam-Status: No, score=-105.0 required=2.0 tests=RCVD_IN_DNSWL_HI, + SPF_HELO_PASS,URIBL_BLOCKED,USER_IN_ALL_SPAM_TO autolearn=disabled + version=3.3.1 +Received: from mx1.redhat.com (mx1.redhat.com [209.132.183.28]) + by d1080.master.cz (Postfix) with ESMTP id 80EB93218161 + for <mcepl@cepl.eu>; Wed, 7 Jan 2015 17:36:10 +0100 (CET) +Received: from int-mx11.intmail.prod.int.phx2.redhat.com (int-mx11.intmail.prod.int.phx2.redhat.com [10.5.11.24]) + by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id t07Ga8gQ031700 + (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-GCM-SHA384 bits=256 verify=FAIL); + Wed, 7 Jan 2015 11:36:09 -0500 +Received: from mitmanek.localdomain (ovpn-116-62.ams2.redhat.com [10.36.116.62]) + by int-mx11.intmail.prod.int.phx2.redhat.com (8.14.4/8.14.4) with ESMTP id t07Ga7QL024180; + Wed, 7 Jan 2015 11:36:08 -0500 +Received: from mitmanek.redhat.com (mitmanek.ceplovi.cz [127.0.0.1]) + by mitmanek.localdomain (Postfix) with ESMTP id 44EA710093A06; + Wed, 7 Jan 2015 17:36:06 +0100 (CET) +From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= <mcepl@redhat.com> +To: piglit@lists.freedesktop.org +Cc: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= <mcepl@cepl.eu> +Subject: [PATCH] Do we really need python 2.7? 
+Date: Wed, 7 Jan 2015 17:36:00 +0100 +Message-Id: <1420648560-24144-1-git-send-email-mcepl@redhat.com> +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +X-Scanned-By: MIMEDefang 2.68 on 10.5.11.27 +X-Scanned-By: MIMEDefang 2.68 on 10.5.110.17 +X-Scanned-By: MIMEDefang 2.68 on 10.5.11.24 +Content-Transfer-Encoding: quoted-printable +X-RedHat-Spam-Score: -1.106 (BAYES_00,RDNS_NONE,URIBL_BLOCKED) 89.185.245.149 [89.185.245.149] 89.185.245.149 [89.185.245.149] <mcepl@redhat.com> +Status: RO +Content-Length: 2410 +Lines: 73 + +From: Mat=C4=9Bj Cepl <mcepl@cepl.eu> + +Hi, + +I am in the process of struggle to build piglit from the master +(commit 4adb082) on RHEL-6 (we would like to continue to use it +for testing). + +When I tried just naively build our RHEL-7/Fedora 20+ package on +EL-6 configuration failed because it complained it is missing +python 2.6. Which is a bit strange, because of course, RHEL-6 +does contain python 2.6. So, I went digging to find out where +this message comes from and I have discovered this interesting +piece of code: + + # Check for presence of Python 2.6 or greater. + foreach(python_cmd python2 python) + execute_process( + COMMAND ${python_cmd} -c \ + "import sys; assert '2.7' <=3D sys.version < '3'" + OUTPUT_QUIET + ERROR_QUIET + RESULT_VARIABLE python_version_check_error_code) + if(python_version_check_error_code EQUAL 0) + set(python ${python_cmd}) + break() + endif(python_version_check_error_code EQUAL 0) + endforeach(python_cmd) + + if(NOT DEFINED python) + message(FATAL_ERROR "python version 2.x (where x >=3D 6) requ= +ired") + endif(NOT DEFINED python) + +First obviously this code lies. Either we really care about +python 2.7 and we should declare our loyalities openly, or piglit +can be working with python 2.6 (which I hope) and then that '2.7' +is just a typo. + +Which one it is? How difficult it would be to switch piglit to +python 2.6? 
"""Unit tests for the ``parse_received`` lexer, grammar and parse_header()."""
import datetime
import email
import os.path
import unittest

from dateutil.tz import tzoffset
from rply import Token
from rply.errors import LexingError, ParsingError

import parse_received


# The fixture lives next to this file, so resolve it relative to the test
# module instead of the current working directory.
SAMPLE_EMAIL = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'samples', 'testing-email-01.eml')

INPUT_01 = \
    """from server.mymailhost.com
        (mail.mymailhost.com [126.43.75.123])
        by pilot01.cl.msu.edu (8.10.2/8.10.2) with ESMTP id NAA23597;
        Fri, 12 Jul 2002 16:11:20 -0400 (EDT)"""
INPUT_02 = \
    """from d1080.master.cz (p-lab.cz [89.185.245.149] (may be forged))
        by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id t07GaC1j031854
        for <mcepl@redhat.com>; Wed, 7 Jan 2015 11:36:13 -0500"""

# Expected parse_header() results for the two inputs above.
PARSED_01 = {
    'by': {'server': 'pilot01.cl.msu.edu'},
    'date': datetime.datetime(
        2002, 7, 12, 16, 11, 20, tzinfo=tzoffset(u'EDT', -14400)),
    'from': {
        'halo': 'server.mymailhost.com',
        'ipaddr': '[126.43.75.123]',
        'reveresed': 'mail.mymailhost.com'
    }
}
PARSED_02 = {
    'by': {'server': 'mx1.redhat.com'},
    'date': datetime.datetime(
        2015, 1, 7, 11, 36, 13, tzinfo=tzoffset(None, -18000)),
    'from': {
        'halo': 'd1080.master.cz',
        'ipaddr': '[89.185.245.149]',
        'reveresed': 'p-lab.cz'
    }
}


class TestReceivedLexer(unittest.TestCase):
    """Check the token stream produced for known header values."""
    maxDiff = None

    def test_simple_01(self):
        expected = [
            Token('FROMSEP', 'from'),
            Token('DOMAIN', 'server.mymailhost.com'),
            Token('DOMAIN', 'mail.mymailhost.com'),
            Token('IPV4ADDRESS', '[126.43.75.123]'),
            Token('BYSEP', 'by'),
            Token('DOMAIN', 'pilot01.cl.msu.edu'),
            Token('SMTPVERSION', '8.10.2/8.10.2'),
            Token('WITHSEP', 'with'),
            Token('SMTPSEP', 'ESMTP'),
            Token('IDSEP', 'id'),
            Token('STRING', 'NAA23597;'),
            Token('DATETIME', 'Fri, 12 Jul 2002 16:11:20 -0400 (EDT)')
        ]
        parsed = list(parse_received.lexer.lex(INPUT_01))
        self.assertEqual(parsed, expected)

    def test_simple_02(self):
        expected = [
            Token('FROMSEP', 'from'),
            Token('DOMAIN', 'd1080.master.cz'),
            Token('DOMAIN', 'p-lab.cz'),
            Token('IPV4ADDRESS', '[89.185.245.149]'),
            Token('IGNORABLESTR', 'may be forged'),
            Token('BYSEP', 'by'),
            Token('DOMAIN', 'mx1.redhat.com'),
            Token('SMTPVERSION', '8.14.4/8.14.4'),
            Token('WITHSEP', 'with'),
            Token('SMTPSEP', 'ESMTP'),
            Token('IDSEP', 'id'),
            Token('STRING', 't07GaC1j031854'),
            Token('FORSEP', 'for'),
            Token('EMAILADDR', '<mcepl@redhat.com>'),
            Token('DATETIME', 'Wed, 7 Jan 2015 11:36:13 -0500')
        ]
        parsed = list(parse_received.lexer.lex(INPUT_02))
        self.assertEqual(parsed, expected)


class TestReceivedParser(unittest.TestCase):
    """Check the parse tree and the dictionary built from it."""
    maxDiff = None

    def test_simple_01(self):
        expected = [
            [Token('FROMSEP', 'from'),
             [Token('DOMAIN', 'server.mymailhost.com'),
              [Token('DOMAIN', 'mail.mymailhost.com'),
               Token('IPV4ADDRESS', '[126.43.75.123]')]]
             ],
            [Token('BYSEP', 'by'),
             [Token('DOMAIN', 'pilot01.cl.msu.edu'),
              [Token('SMTPVERSION', '8.10.2/8.10.2')]]
             ],
            [[[],
              [Token('SMTPSEP', 'ESMTP')],
              [Token('STRING', 'NAA23597;')], []]
             ],
            Token('DATETIME', 'Fri, 12 Jul 2002 16:11:20 -0400 (EDT)')
        ]
        # A freshly created token stream already starts at the beginning.
        stream = parse_received.lexer.lex(INPUT_01)
        parsed = parse_received.parser.parse(stream)
        self.assertEqual(parsed, expected)

    def test_simple_02(self):
        expected = [
            [Token('FROMSEP', 'from'),
             [Token('DOMAIN', 'd1080.master.cz'),
              [Token('DOMAIN', 'p-lab.cz'),
               Token('IPV4ADDRESS', '[89.185.245.149]')]]
             ],
            [Token('BYSEP', 'by'),
             [Token('DOMAIN', 'mx1.redhat.com'),
              [Token('SMTPVERSION', '8.14.4/8.14.4')]]
             ],
            [[[],
              [Token('SMTPSEP', 'ESMTP')],
              [Token('STRING', 't07GaC1j031854')],
              [Token('FORSEP', 'for')]]
             ],
            Token('DATETIME', 'Wed, 7 Jan 2015 11:36:13 -0500')
        ]
        stream = parse_received.lexer.lex(INPUT_02)
        parsed = parse_received.parser.parse(stream)
        self.assertEqual(parsed, expected)

    def test_parse_01(self):
        observed = parse_received.parse_header(INPUT_01)
        self.assertEqual(observed, PARSED_01)

    def test_parse_02(self):
        observed = parse_received.parse_header(INPUT_02)
        self.assertEqual(observed, PARSED_02)

    def test_email_01(self):
        """Parse the Received headers of the sample message.

        The fixture contains the header used as INPUT_02, so its parsed form
        must be among the results.  Header forms the grammar does not cover
        yet are skipped rather than failing the whole test.
        """
        with open(SAMPLE_EMAIL, 'r') as inmail:
            msg = email.message_from_file(inmail)
        received_hdrs = msg.get_all('Received', [])
        self.assertTrue(received_hdrs)

        parsed_headers = []
        for hdr in received_hdrs:
            try:
                parsed_headers.append(parse_received.parse_header(hdr))
            except (LexingError, ParsingError):
                # TODO: extend the grammar (LMTP, "(Postfix)", headers
                # without a "from" clause) and tighten this test.
                continue
        self.assertIn(PARSED_02, parsed_headers)


if __name__ == '__main__':
    unittest.main()