package parse

import (
	"bytes"
	"io"
	"regexp"
	"sort"
)

// urlRe is a partial regexp that matches only the beginning of URLs and
// email addresses. The remainder of the matched URL/email is parsed
// manually by HttpLinks.
//
// Submatch 1 is the URL scheme (first alternative); submatch 2 is the
// optional explicit "mailto:" prefix (second alternative).
var urlRe = regexp.MustCompile(
	`([a-z]{2,8})://` + // URL start
		`|` + // or
		`(mailto:)?[[:alnum:]_+.~/-]*[[:alnum:]]@`, // email start
)

// HttpLinks reads r until EOF and returns a reader that replays the
// consumed bytes together with the links found in them.
//
// Despite its name, it reports every URL that starts with a 2-8 letter
// lowercase scheme followed by "://" as well as email addresses. Email
// addresses written without an explicit "mailto:" prefix are returned
// with that prefix added. The returned slice is de-duplicated and sorted.
//
// If reading r fails, no links are returned (nil slice) and the returned
// reader yields the bytes that were read before the failure followed by
// whatever r still produces, so no input is lost to the caller.
func HttpLinks(r io.Reader) (io.Reader, []string) {
	buf, err := io.ReadAll(r)
	if err != nil {
		// io.ReadAll returns the data it consumed before the error.
		// Returning r alone would silently drop those bytes, so replay
		// them in front of the remainder of r.
		return io.MultiReader(bytes.NewReader(buf), r), nil
	}

	links := make(map[string]struct{})
	b := buf
	for match := urlRe.FindSubmatchIndex(b); match != nil; match = urlRe.FindSubmatchIndex(b) {
		// Regular expressions do not really cut it here and we
		// need to detect opening/closing braces to handle
		// markdown link syntax.
		var paren, bracket, ltgt int
		var emitURL bool

		// Re-base b on the start of the match. scheme is then the
		// length of the matched prefix and j the scan position in b.
		start, end := match[0], match[1]
		b = b[start:]
		scheme := end - start
		j := scheme

		// "inline" email without a mailto: prefix (neither submatch
		// participated) - add some extra checks for those.
		inlineEmail := len(match) > 4 && match[2] == -1 && match[4] == -1

		for !emitURL && j < len(b) && bytes.IndexByte(urichars, b[j]) != -1 {
			switch b[j] {
			case '[':
				bracket++
				j++
			case '(':
				paren++
				j++
			case '<':
				ltgt++
				j++
			case ']':
				// A closer without a matching opener inside the
				// link belongs to the surrounding text: stop here.
				bracket--
				if bracket < 0 {
					emitURL = true
				} else {
					j++
				}
			case ')':
				paren--
				if paren < 0 {
					emitURL = true
				} else {
					j++
				}
			case '>':
				ltgt--
				if ltgt < 0 {
					emitURL = true
				} else {
					j++
				}
			case '&':
				if inlineEmail {
					emitURL = true
				} else {
					j++
				}
			default:
				j++
			}

			// We don't want opening brackets in inline emails: step
			// back over the one just consumed and stop.
			if inlineEmail && (paren > 0 || ltgt > 0 || bracket > 0) {
				j--
				emitURL = true
			}
		}

		// Heuristic to remove trailing characters that are
		// valid URL characters, but typically not at the end of
		// the URL.
		for trim := true; trim && j > 0; {
			switch b[j-1] {
			case '.', ',', ':', ';', '?', '!', '"', '\'', '%':
				j--
			default:
				trim = false
			}
		}

		if j == scheme {
			// Only an URL scheme, ignore.
			b = b[j:]
			continue
		}

		url := string(b[:j])
		if inlineEmail {
			// Email address with missing mailto: scheme. Add it.
			url = "mailto:" + url
		}
		links[url] = struct{}{}
		b = b[j:]
	}

	// Map iteration order is random; sort for deterministic output.
	results := make([]string, 0, len(links))
	for link := range links {
		results = append(results, link)
	}
	sort.Strings(results)

	return bytes.NewReader(buf), results
}

// urichars is the set of bytes accepted while scanning the body of a
// link; the first byte outside this set ends the scan.
var urichars = []byte(
	"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" +
		"0123456789-_.,~:;/?#@!$&%*+=\"'<>()[]",
)