// dateRe locates an RFC 1123Z style timestamp (for instance
// "Tue, 17 Oct 2023 14:40:08 +0200") anywhere inside a longer string such as
// the value of a Received header. The comma after the week day and the
// seconds are optional. The numeric zone must start with '+' or '-'.
var dateRe = regexp.MustCompile(`(((Mon|Tue|Wed|Thu|Fri|Sat|Sun))[,]?\s[0-9]{1,2})\s` +
	`(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s` +
	`([0-9]{4})\s([0-9]{2}):([0-9]{2})(:([0-9]{2}))?\s([+-][0-9]{4})`)
// splitMIME breaks a MIME type such as "text/plain" into its major and minor
// components. The minor component is empty unless the input consists of
// exactly two "/"-separated fields.
func splitMIME(m string) (string, string) {
	fields := strings.Split(m, "/")
	major, minor := fields[0], ""
	if len(fields) == 2 {
		minor = fields[1]
	}
	return major, minor
}
// DateParseError is the sentinel error that parseEnvelope wraps when only the
// date of a message could not be parsed. The envelope returned alongside it is
// still populated; callers detect this case with errors.Is.
var DateParseError = errors.New("date parsing failed")
DateParseError, err) //nolint:errorlint // can only use %w once + } + return &models.Envelope{ + Date: date, + Subject: subj, + MessageId: msgID, + From: from, + ReplyTo: replyTo, + To: to, + Cc: cc, + Bcc: bcc, + InReplyTo: irt, + }, err +} + +// parseDate tries to parse the date from the Date header with non std formats +// if this fails it tries to parse the received header as well +func parseDate(h *mail.Header) (time.Time, error) { + t, err := h.Date() + if err == nil { + return t, nil + } + text, err := h.Text("date") + // sometimes, no error occurs but the date is empty. + // In this case, guess time from received header field + if err != nil || text == "" { + t, err := parseReceivedHeader(h) + if err == nil { + return t, nil + } + } + layouts := []string{ + // X-Mailer: EarthLink Zoo Mail 1.0 + "Mon, _2 Jan 2006 15:04:05 -0700 (GMT-07:00)", + } + for _, layout := range layouts { + if t, err := time.Parse(layout, text); err == nil { + return t, nil + } + } + // still no success, try the received header as a last resort + t, err = parseReceivedHeader(h) + if err != nil { + return time.Time{}, fmt.Errorf("unrecognized date format: %s", text) + } + return t, nil +} + +func parseReceivedHeader(h *mail.Header) (time.Time, error) { + guess, err := h.Text("received") + if err != nil { + return time.Time{}, fmt.Errorf("received header not parseable: %w", + err) + } + return time.Parse(time.RFC1123Z, dateRe.FindString(guess)) +} + +func parseAddressList(h *mail.Header, key string) ([]*mail.Address, error) { + hdr, err := h.Text(key) + if err != nil && !message.IsUnknownCharset(err) { + return nil, err + } + if hdr == "" { + return nil, nil + } + add, err := mail.ParseAddressList(hdr) + if err != nil { + return []*mail.Address{{Name: hdr}}, nil + } + return add, err +} + +// RawMessage is an interface that describes a raw message +type RawMessage interface { + NewReader() (io.ReadCloser, error) + ModelFlags() (models.Flags, error) + Labels() ([]string, error) + UID() 
uint32 +} + +// MessageInfo populates a models.MessageInfo struct for the message. +// based on the reader returned by NewReader +func MessageInfo(raw RawMessage) (*models.MessageInfo, error) { + var parseErr error + r, err := raw.NewReader() + if err != nil { + return nil, err + } + defer r.Close() + msg, err := ReadMessage(r) + if err != nil { + return nil, fmt.Errorf("could not read message: %w", err) + } + bs, err := ParseEntityStructure(msg) + if errors.As(err, new(message.UnknownEncodingError)) { + parseErr = err + } else if err != nil { + return nil, fmt.Errorf("could not get structure: %w", err) + } + h := &mail.Header{Header: msg.Header} + env, err := parseEnvelope(h) + if err != nil && !errors.Is(err, DateParseError) { + return nil, fmt.Errorf("could not parse envelope: %w", err) + // if only the date parsing failed we still get the rest of the + // envelop structure in a valid state. + // Date parsing errors are fairly common and it's better to be + // slightly off than to not be able to read the mails at all + // hence we continue here + } + recDate, _ := parseReceivedHeader(h) + if recDate.IsZero() { + // better than nothing, if incorrect + recDate = env.Date + } + flags, err := raw.ModelFlags() + if err != nil { + return nil, err + } + labels, err := raw.Labels() + if err != nil { + return nil, err + } + return &models.MessageInfo{ + BodyStructure: bs, + Envelope: env, + Flags: flags, + Labels: labels, + InternalDate: recDate, + RFC822Headers: h, + Size: 0, + Uid: raw.UID(), + Error: parseErr, + }, nil +} + +// MessageHeaders populates a models.MessageInfo struct for the message. +// based on the reader returned by NewReader. Minimal information is included. 
// NewCRLFReader reads r until it is exhausted and returns a reader over the
// same content with every line terminated by CRLF.
//
// Lines are split on "\n" and one trailing "\r" is dropped before "\r\n" is
// appended, so input that already uses CRLF is left unchanged. A final line
// without a terminator gets one. Lines of any length are supported.
//
// The signature has no room to report a read error: if reading r fails, the
// content read up to that point is kept and the rest is dropped.
func NewCRLFReader(r io.Reader) io.Reader {
	var buf bytes.Buffer
	br := bufio.NewReader(r)
	for {
		// ReadBytes, unlike bufio.Scanner, has no upper bound on the
		// length of a line.
		line, err := br.ReadBytes('\n')
		if len(line) > 0 {
			line = bytes.TrimSuffix(line, []byte("\n"))
			line = bytes.TrimSuffix(line, []byte("\r"))
			buf.Write(line)
			buf.WriteString("\r\n")
		}
		if err != nil {
			// io.EOF or a genuine read error: nothing more to read
			break
		}
	}
	return &buf
}
+func ReadMessage(r io.Reader) (*message.Entity, error) { + entity, err := message.Read(r) + if message.IsUnknownCharset(err) { + log.Warnf("unknown charset encountered") + } else if err != nil { + return nil, fmt.Errorf("could not read message: %w", err) + } + return entity, nil +} diff --git a/lib/rfc822/message_test.go b/lib/rfc822/message_test.go new file mode 100644 index 00000000..8730afe2 --- /dev/null +++ b/lib/rfc822/message_test.go @@ -0,0 +1,84 @@ +package rfc822_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "git.sr.ht/~rjarry/aerc/lib/rfc822" + "git.sr.ht/~rjarry/aerc/models" +) + +func TestMessageInfoParser(t *testing.T) { + rootDir := "testdata/message/valid" + msgFiles, err := os.ReadDir(rootDir) + die(err) + + for _, fi := range msgFiles { + if fi.IsDir() { + continue + } + + p := fi.Name() + t.Run(p, func(t *testing.T) { + m := newMockRawMessageFromPath(filepath.Join(rootDir, p)) + mi, err := rfc822.MessageInfo(m) + if err != nil { + t.Fatal("Failed to create MessageInfo with:", err) + } + + if perr := mi.Error; perr != nil { + t.Fatal("Expected no parsing error, but got:", mi.Error) + } + }) + } +} + +func TestMessageInfoHandledError(t *testing.T) { + rootDir := "testdata/message/invalid" + msgFiles, err := os.ReadDir(rootDir) + die(err) + + for _, fi := range msgFiles { + if fi.IsDir() { + continue + } + + p := fi.Name() + t.Run(p, func(t *testing.T) { + m := newMockRawMessageFromPath(filepath.Join(rootDir, p)) + mi, err := rfc822.MessageInfo(m) + if err != nil { + t.Fatal(err) + } + + if perr := mi.Error; perr == nil { + t.Fatal("Expected MessageInfo.Error, got none") + } + }) + } +} + +type mockRawMessage struct { + path string +} + +func newMockRawMessageFromPath(p string) *mockRawMessage { + return &mockRawMessage{ + path: p, + } +} + +func (m *mockRawMessage) NewReader() (io.ReadCloser, error) { + return os.Open(m.path) +} +func (m *mockRawMessage) ModelFlags() (models.Flags, error) { return 0, nil } +func (m 
// die aborts the test binary by panicking with err. It does nothing when err
// is nil.
func die(err error) {
	if err == nil {
		return
	}
	panic(err)
}

Congratulations Netflix Customer!


+ + + +--Nextpart_1Q2YJhd197991794467076Pgfa-- diff --git a/lib/rfc822/testdata/message/valid/quoted-mime-type b/lib/rfc822/testdata/message/valid/quoted-mime-type new file mode 100644 index 00000000..d9af28a2 --- /dev/null +++ b/lib/rfc822/testdata/message/valid/quoted-mime-type @@ -0,0 +1,45 @@ +Subject: Your ECOLINES tickets +X-PHP-Originating-Script: 33:functions.inc.php +From: ECOLINES +Content-Type: multipart/mixed; + boundary="PHP-mixed-ba319678ca12656cfb8cd46e736ce09d" +Message-Id: +Date: Sun, 29 May 2022 15:53:44 +0300 + +--PHP-mixed-ba319678ca12656cfb8cd46e736ce09d +Content-Type: multipart/alternative; boundary="PHP-alt-ba319678ca12656cfb8cd46e736ce09d" + +--PHP-alt-ba319678ca12656cfb8cd46e736ce09d +Content-Type: text/plain; charset="UTF-8" +Content-Transfer-Encoding: 7bit + +Your tickets are attached to this message. Also You can print out Your tickets from our website www.ecolines.net +… + +--PHP-alt-ba319678ca12656cfb8cd46e736ce09d +Content-Type: text/html; charset="UTF-8" +Content-Transfer-Encoding: 7bit + + +… + +--PHP-alt-ba319678ca12656cfb8cd46e736ce09d-- + +--PHP-mixed-ba319678ca12656cfb8cd46e736ce09d +Content-Type: "application/pdf"; name="17634428.pdf" +Content-Disposition: attachment; filename="17634428.pdf" +Content-Transfer-Encoding: base64 + +JVBERi0xLjQKMSAwIG9iago8PAovVGl0bGUgKP7/AFkAbwB1AHIAIAB0AGkAYwBrAGUAdCkKL0Ny +… + +--PHP-mixed-ba319678ca12656cfb8cd46e736ce09d +Content-Type: "application/pdf"; name="invoice-6385490.pdf" +Content-Disposition: attachment; filename="invoice-6385490.pdf" +Content-Transfer-Encoding: base64 + +JVBERi0xLjQKMSAwIG9iago8PAovVGl0bGUgKP7/AEkAbgB2AG8AaQBjAGUpCi9DcmVhdG9yICj+ +… + +--PHP-mixed-ba319678ca12656cfb8cd46e736ce09d-- -- cgit