diff --git a/go/go.mod b/go/go.mod index 9b52481202..3470cf8946 100644 --- a/go/go.mod +++ b/go/go.mod @@ -10,6 +10,7 @@ require ( github.com/dghubble/gologin/v2 v2.2.0 github.com/dghubble/oauth1 v0.6.0 github.com/dgrijalva/jwt-go v3.2.0+incompatible + github.com/google/safehtml v0.0.2 // indirect github.com/gorilla/mux v1.8.0 github.com/gorilla/securecookie v1.1.1 github.com/gorilla/sessions v1.2.1 diff --git a/go/go.sum b/go/go.sum index 6934dfff90..1473abef54 100644 --- a/go/go.sum +++ b/go/go.sum @@ -37,6 +37,8 @@ github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= +github.com/google/safehtml v0.0.2 h1:ZOt2VXg4x24bW0m2jtzAOkhoXV0iM8vNKc0paByCZqM= +github.com/google/safehtml v0.0.2/go.mod h1:L4KWwDsUJdECRAEpZoBn3O64bQaywRscowZjJAzjHnU= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= diff --git a/go/twitterchiver/default.nix b/go/twitterchiver/default.nix index fae642a4f4..abd8b66406 100644 --- a/go/twitterchiver/default.nix +++ b/go/twitterchiver/default.nix @@ -11,4 +11,23 @@ depot.third_party.gopkgs."github.com".jackc.pgx.v4 ]; }; + + viewer = depot.third_party.buildGo.program { + name = "viewer"; + srcs = [ ./viewer/viewer.go ]; + deps = [ + depot.third_party.gopkgs."github.com".google.safehtml + depot.third_party.gopkgs."github.com".google.safehtml.template + depot.third_party.gopkgs."github.com".google.safehtml.uncheckedconversions + depot.third_party.gopkgs."github.com".gorilla.mux + 
depot.third_party.gopkgs."github.com".jackc.pgtype + depot.third_party.gopkgs."github.com".jackc.pgx.v4.pgxpool + depot.go.openshiftauth.openshiftauth + ]; + dockerData = [ ( + depot.pkgs.runCommand "source" {} '' + cp -R ${builtins.filterSource (path: type: (type == "directory" && depot.lib.hasSuffix "/templates" path) || (depot.lib.hasInfix "/templates/" path)) ./viewer} $out + '' + ) ]; + }; } diff --git a/go/twitterchiver/viewer/templates/index.html b/go/twitterchiver/viewer/templates/index.html new file mode 100644 index 0000000000..f23b9bb978 --- /dev/null +++ b/go/twitterchiver/viewer/templates/index.html @@ -0,0 +1,8 @@ + +

Twitterchiver

+

Accounts visible to {{.Username}}

+ diff --git a/go/twitterchiver/viewer/templates/tweets.html b/go/twitterchiver/viewer/templates/tweets.html new file mode 100644 index 0000000000..14605628ff --- /dev/null +++ b/go/twitterchiver/viewer/templates/tweets.html @@ -0,0 +1,158 @@ + + +

Twitterchiver: {{.TwitterUsername}}

+{{if .Query}} +

Searching for: {{.Query}}

+{{end}} +
+ +
+
    + {{range .Tweets}} +
  1. +
    + {{$status := .Object}} + {{if .Object.retweeted_status}} + {{$status = .Object.retweeted_status}} + + + {{end}} + + +
    {{call $.FormatTweetText $status.full_text $status}}
    + {{if .Object.extended_entities}} +
    + {{range $entity := .Object.extended_entities.media}} +
    + + + +
    + {{end}} +
    + {{end}} +
    +
  2. + {{end}} +
+ +{{if .NextTweetID}} +...next +{{end}} diff --git a/go/twitterchiver/viewer/viewer.go b/go/twitterchiver/viewer/viewer.go new file mode 100644 index 0000000000..0bf58fc551 --- /dev/null +++ b/go/twitterchiver/viewer/viewer.go @@ -0,0 +1,418 @@ +// SPDX-FileCopyrightText: 2020 Luke Granger-Brown +// +// SPDX-License-Identifier: Apache-2.0 +package main + +import ( + "context" + "encoding/json" + "flag" + "fmt" + "log" + "net/http" + "sort" + "strconv" + "strings" + "time" + + "github.com/google/safehtml" + "github.com/google/safehtml/template" + "github.com/google/safehtml/uncheckedconversions" + "github.com/gorilla/mux" + "github.com/jackc/pgtype" + "github.com/jackc/pgx/v4/pgxpool" + "hg.lukegb.com/lukegb/depot/go/openshiftauth" +) + +var ( + databaseURL = flag.String("database_url", "", "Database URL") + userMapping = userMap{} + localDisableAuth = flag.String("local_auth_override_user", "", "Disable authn/authz - used for dev - set to username") + + indexTmpl = template.Must(template.ParseFiles("templates/index.html")) + tweetsTmpl = template.Must(template.ParseFiles("templates/tweets.html")) +) + +func init() { + flag.Var(userMapping, "user_to_twitter", "Space-separated list of :") +} + +type userMap map[string][]string + +func (um userMap) String() string { + var bits []string + for u, ts := range um { + ts2 := make([]string, len(ts)) + copy(ts2, ts) + sort.Strings(ts2) + bits = append(bits, fmt.Sprintf("%s:%s", u, strings.Join(ts2, ","))) + } + sort.Strings(bits) + return strings.Join(bits, " ") +} + +func (um userMap) Set(v string) error { + bits := strings.Split(v, " ") + for _, b := range bits { + utsPair := strings.Split(b, ":") + if len(utsPair) != 2 { + return fmt.Errorf("%v is not a : string", b) + } + u := utsPair[0] + ts := utsPair[1] + um[u] = append(um[u], strings.Split(ts, ",")...) 
+ } + return nil +} + +func main() { + flag.Parse() + ctx := context.Background() + + pool, err := pgxpool.Connect(ctx, *databaseURL) + if err != nil { + log.Fatalf("pgxpool.Connect: %v", err) + } + defer pool.Close() + + r := mux.NewRouter() + r.HandleFunc("/healthz", func(rw http.ResponseWriter, r *http.Request) { + rw.Header().Set("Content-Type", "text/plain") + fmt.Fprintf(rw, "ok") + }) + + var authR *mux.Router + if *localDisableAuth != "" { + authR = r + } else { + authR, err = openshiftauth.NewRouter(r) + if err != nil { + log.Fatalf("openshiftauth.NewRouter: %v", err) + } + } + + userFromContext := func(ctx context.Context) string { + if *localDisableAuth != "" { + return *localDisableAuth + } + return openshiftauth.UserFromContext(ctx).Metadata.Name + } + + writeError := func(rw http.ResponseWriter, status int, wrap string, err error) { + log.Printf("Error in HTTP handler: %v: %v", wrap, err) + rw.WriteHeader(status) + fmt.Fprintf(rw, "

Oops. Something went wrong.

") + fmt.Fprintf(rw, "

%s

", wrap) + } + + authR.HandleFunc("/", func(rw http.ResponseWriter, r *http.Request) { + ctx := r.Context() + user := userFromContext(ctx) + twitterAccounts := userMapping[user] + + rows, err := pool.Query(ctx, "SELECT ua.username, COUNT(uat.tweetid) tweet_count, (SELECT CAST(object->>'created_at' AS timestamp with time zone) FROM tweets WHERE id=MAX(uat.tweetid)) latest_tweet FROM user_accounts ua LEFT JOIN user_accounts_tweets uat ON uat.userid=ua.userid WHERE ua.username = ANY($1::text[]) GROUP BY 1 ORDER BY 1", twitterAccounts) + if err != nil { + writeError(rw, http.StatusInternalServerError, "querying database", err) + return + } + defer rows.Close() + + type twitterData struct { + Username string + TweetCount int + LatestTweet time.Time + } + var tds []twitterData + + for rows.Next() { + var td twitterData + if err := rows.Scan(&td.Username, &td.TweetCount, &td.LatestTweet); err != nil { + writeError(rw, http.StatusInternalServerError, "reading from database", err) + return + } + tds = append(tds, td) + } + rows.Close() + + indexTmpl.Execute(rw, struct { + Username string + TwitterAccounts []twitterData + }{ + Username: user, + TwitterAccounts: tds, + }) + }) + isAllowedToSee := func(ctx context.Context, twitterUser string) bool { + twitterAccounts := userMapping[userFromContext(ctx)] + for _, a := range twitterAccounts { + if a == twitterUser { + return true + } + } + return false + } + toInt := func(s string, def int) int { + n, err := strconv.ParseInt(s, 10, 0) + if err != nil { + return def + } + return int(n) + } + clamp := func(min, n, max int) int { + if n < min { + return min + } else if n > max { + return max + } + return n + } + authR.HandleFunc("/view/{twitterUser}", func(rw http.ResponseWriter, r *http.Request) { + ctx := r.Context() + vars := mux.Vars(r) + twitterUser := vars["twitterUser"] + if !isAllowedToSee(ctx, twitterUser) { + writeError(rw, http.StatusNotFound, "no such twitter user being archived", fmt.Errorf("user %q attempted to access 
%q", userFromContext(ctx), twitterUser)) + return + } + + q := r.URL.Query() + pageSize := clamp(1, toInt(q.Get("page_size"), 20), 200) + startFrom := toInt(q.Get("start_from"), 0) + query := q.Get("q") + + rows, err := pool.Query(ctx, ` +SELECT + t.id, + t.text, + t.object, + CAST(COALESCE(t.object->'retweeted_status'->>'created_at', t.object->>'created_at') AS timestamp with time zone) created_at +FROM + user_accounts_tweets uat +INNER JOIN + user_accounts ua ON ua.userid=uat.userid +INNER JOIN tweets t ON t.id=uat.tweetid +WHERE 1=1 + AND ua.username=$1 + AND ($3::bigint=0 OR t.id <= $3::bigint) + AND ($4='' OR ($4<>'' AND (to_tsvector('english', text) @@ to_tsquery('english', $4) OR to_tsvector('english', object->'retweeted_status'->>'full_text') @@ to_tsquery('english', $4) OR object->'user'->>'screen_name'=$4))) +ORDER BY t.id DESC +LIMIT $2 +`, twitterUser, pageSize+1, startFrom, query) + if err != nil { + writeError(rw, http.StatusInternalServerError, "querying database", err) + return + } + defer rows.Close() + + type tweet struct { + ID int64 + Text string + CreatedAt time.Time + CreatedAtFriendly string + Object interface{} + } + type twitterData struct { + TwitterUsername string + Query string + Tweets []tweet + NextTweetID *int64 + + FormatTweetText func(string, interface{}) safehtml.HTML + } + pullIndices := func(m map[string]interface{}) [2]int { + midx := m["indices"].([]interface{}) + midx0 := int(midx[0].(float64)) + midx1 := int(midx[1].(float64)) + return [2]int{midx0, midx1} + } + td := twitterData{ + TwitterUsername: twitterUser, + Query: query, + FormatTweetText: func(t string, tw interface{}) safehtml.HTML { + ltRep := string([]rune{0xe000}) + gtRep := string([]rune{0xe001}) + t = strings.ReplaceAll(t, "<", ltRep) + t = strings.ReplaceAll(t, ">", gtRep) + + type span struct { + span [2]int + whatDo string // remove, link + + linkTo string // link only + linkText string // link only + } + // Delete native media and add links. 
+ var spans []span + obj := tw.(map[string]interface{}) + if ee, ok := obj["extended_entities"].(map[string]interface{}); ok { + ems := ee["media"].([]interface{}) + for _, emi := range ems { + em := emi.(map[string]interface{}) + spans = append(spans, span{ + span: pullIndices(em), + whatDo: "remove", + }) + } + } + if es, ok := obj["entities"].(map[string]interface{}); ok { + if hts, ok := es["hashtags"].([]interface{}); ok { + for _, hti := range hts { + ht := hti.(map[string]interface{}) + htt := ht["text"].(string) + spans = append(spans, span{ + span: pullIndices(ht), + whatDo: "link", + linkTo: fmt.Sprintf("https://twitter.com/hashtag/%s", htt), + }) + } + } + if urls, ok := es["urls"].([]interface{}); ok { + for _, urli := range urls { + url := urli.(map[string]interface{}) + urldisp := url["display_url"].(string) + urlexp := url["expanded_url"].(string) + spans = append(spans, span{ + span: pullIndices(url), + whatDo: "link", + linkTo: urlexp, + linkText: urldisp, + }) + } + } + if mentions, ok := es["user_mentions"].([]interface{}); ok { + for _, mentioni := range mentions { + mention := mentioni.(map[string]interface{}) + mentionuser := mention["screen_name"].(string) + spans = append(spans, span{ + span: pullIndices(mention), + whatDo: "link", + linkTo: fmt.Sprintf("https://twitter.com/%s", mentionuser), + }) + } + } + if symbols, ok := es["symbols"].([]interface{}); ok { + for _, symboli := range symbols { + symbol := symboli.(map[string]interface{}) + symbolname := symbol["text"].(string) + spans = append(spans, span{ + span: pullIndices(symbol), + whatDo: "link", + linkTo: fmt.Sprintf("?q=$%s", symbolname), + }) + } + } + } + // Sort removeSpans from the end to the beginning. + sort.Slice(spans, func(a, b int) bool { + return spans[a].span[0] > spans[b].span[0] + }) + // Expand overlapping remove spans. 
+ newSpans := make([]span, 0, len(spans)) + for i := 0; i < len(spans)-1; i++ { + span := spans[i] + prevSpan := spans[i+1] + if prevSpan.span[0] <= span.span[0] && prevSpan.span[1] >= span.span[1] { + // Spans overlap. + if span.whatDo != "remove" || prevSpan.whatDo != "remove" { + log.Printf("found overlapping non-remove spans!") + } + if span.span[1] > prevSpan.span[1] { + prevSpan.span[1] = span.span[1] + } + continue + } + newSpans = append(newSpans, span) + } + if len(spans) > 0 { + newSpans = append(newSpans, spans[len(spans)-1]) + } + spans = newSpans + runed := []rune(t) + for _, span := range spans { + switch span.whatDo { + case "remove": + // Delete text from span[0] to span[1]. + runed = append(runed[:span.span[0]], runed[span.span[1]:]...) + case "link": + // Add a link. + var text []rune + if span.linkText == "" { + text = runed[span.span[0]:span.span[1]] + } else { + text = []rune(span.linkText) + } + runedBits := [][]rune{ + runed[:span.span[0]], + []rune(""), + []rune(text), + []rune(""), + runed[span.span[1]:], + } + finalLen := 0 + for _, s := range runedBits { + finalLen += len(s) + } + runed = make([]rune, finalLen) + p := 0 + for _, s := range runedBits { + p += copy(runed[p:], s) + } + default: + log.Printf("unknown span operation %v", span.whatDo) + } + } + t = string(runed) + + // HTML escape any <> + t = strings.ReplaceAll(t, ltRep, "<") + t = strings.ReplaceAll(t, gtRep, ">") + + return uncheckedconversions.HTMLFromStringKnownToSatisfyTypeContract(t) + }, + } + now := time.Now() + for rows.Next() { + var t tweet + var o pgtype.JSONB + if err := rows.Scan(&t.ID, &t.Text, &o, &t.CreatedAt); err != nil { + writeError(rw, http.StatusInternalServerError, "reading from database", err) + return + } + if err := json.Unmarshal(o.Bytes, &t.Object); err != nil { + writeError(rw, http.StatusInternalServerError, "parsing JSON from database", err) + return + } + ago := now.Sub(t.CreatedAt) + switch { + case t.CreatedAt.Year() != now.Year(): + 
t.CreatedAtFriendly = t.CreatedAt.Format("Jan 2, 2006") + case t.CreatedAt.YearDay() != now.YearDay(): + t.CreatedAtFriendly = t.CreatedAt.Format("Jan 2") + case ago.Hours() >= 1.0: + t.CreatedAtFriendly = fmt.Sprintf("%dh", int(ago.Hours())) + case ago.Minutes() >= 1.0: + t.CreatedAtFriendly = fmt.Sprintf("%dm", int(ago.Minutes())) + case ago.Seconds() >= 0.0: + t.CreatedAtFriendly = fmt.Sprintf("%ds", int(ago.Seconds())) + default: + t.CreatedAtFriendly = fmt.Sprintf("in %ds", -int(ago.Seconds())) + } + td.Tweets = append(td.Tweets, t) + } + rows.Close() + + if len(td.Tweets) > pageSize { + td.NextTweetID = &td.Tweets[pageSize].ID + td.Tweets = td.Tweets[:pageSize] + } + + if err := tweetsTmpl.Execute(rw, td); err != nil { + log.Printf("tweets: executing template: %v", err) + } + }) + + log.Printf("now listening on :8080") + log.Print(http.ListenAndServe(":8080", r)) +} diff --git a/third_party/default.nix b/third_party/default.nix index 96571c7f4f..8f4b8e3765 100644 --- a/third_party/default.nix +++ b/third_party/default.nix @@ -19,14 +19,15 @@ rec { buildGo = let orig = import ./tvl/nix/buildGo { pkgs = nixpkgs; }; in orig // { - program = args: + program = { dockerData ? [], ... }@args: let - origOut = orig.program args; + origOut = orig.program (nixpkgs.lib.filterAttrs (n: v: n != "dockerData") args); in origOut // { dockerImage = nixpkgs.dockerTools.buildImage { name = args.name; + contents = dockerData; config = { - Cmd = [ "${origOut}/bin/${args.name}" ]; + Entrypoint = [ "${origOut}/bin/${args.name}" ]; Env = [ "SSL_CERT_FILE=${nixpkgs.cacert}/etc/ssl/certs/ca-bundle.crt" ];