diff --git a/cmd/rio/main.go b/cmd/rio/main.go index 349d490..49681f5 100644 --- a/cmd/rio/main.go +++ b/cmd/rio/main.go @@ -7,13 +7,16 @@ import ( "net" "net/http" "os" + "regexp" "strings" "sync" "time" "git.polynom.me/rio/internal/acme" "git.polynom.me/rio/internal/certificates" + "git.polynom.me/rio/internal/context" "git.polynom.me/rio/internal/dns" + "git.polynom.me/rio/internal/metrics" "git.polynom.me/rio/internal/pages" "git.polynom.me/rio/internal/repo" "git.polynom.me/rio/internal/server" @@ -24,14 +27,14 @@ import ( "github.com/urfave/cli/v2" ) -func handleSubdomain(pagesDomain, domain, cname, path, giteaUrl, defaultCsp string, giteaClient *repo.GiteaClient, lokiConfig *pages.LokiMetricConfig, w http.ResponseWriter) { +func handleSubdomain(ctx *context.GlobalContext, domain, cname, path string, req *http.Request, w http.ResponseWriter) { username := "" if cname != "" { // If we are accessed via a CNAME, then CNAME contains our . value. - username = dns.ExtractUsername(pagesDomain, cname) + username = dns.ExtractUsername(ctx.PagesDomain, cname) } else { // If we are directly accessed, then domain contains our . value. - username = dns.ExtractUsername(pagesDomain, domain) + username = dns.ExtractUsername(ctx.PagesDomain, domain) } // Strip the leading / @@ -52,7 +55,7 @@ func handleSubdomain(pagesDomain, domain, cname, path, giteaUrl, defaultCsp stri domain, cname, path, - giteaClient, + ctx.Gitea, ) if err != nil { log.Errorf("Failed to get repo: %s", err) @@ -60,15 +63,30 @@ func handleSubdomain(pagesDomain, domain, cname, path, giteaUrl, defaultCsp stri return } - pages.ServeFile(username, repo.Name, path, defaultCsp, domain, giteaClient, lokiConfig, w) + d := domain + if cname != "" { + d = cname + } + + c := &context.Context{ + Username: username, + Reponame: repo.Name, + Domain: d, + Path: path, + Referrer: req.Header.Get("Referer"), + UserAgent: req.Header.Get("User-Agent"), + Writer: w, + Global: ctx, + } + pages.ServeFile(c) } -func Handler(pagesDomain, giteaUrl, defaultCsp string, giteaClient *repo.GiteaClient, lokiConfig *pages.LokiMetricConfig) http.HandlerFunc { +func Handler(ctx *context.GlobalContext) http.HandlerFunc { return func(w http.ResponseWriter, req *http.Request) { w.Header().Set("Server", "rio") // Is the direct domain requested? - if req.Host == pagesDomain { + if req.Host == ctx.PagesDomain { log.Debug("Direct pages domain is requested.") // TODO: Handle @@ -77,9 +95,9 @@ func Handler(pagesDomain, giteaUrl, defaultCsp string, giteaClient *repo.GiteaCl } // Is a direct subdomain requested? - if strings.HasSuffix(req.Host, pagesDomain) { + if strings.HasSuffix(req.Host, ctx.PagesDomain) { log.Debug("Domain can be directly handled") - handleSubdomain(pagesDomain, req.Host, "", req.URL.Path, giteaUrl, defaultCsp, giteaClient, lokiConfig, w) + handleSubdomain(ctx, req.Host, "", req.URL.Path, req, w) return } @@ -94,9 +112,9 @@ func Handler(pagesDomain, giteaUrl, defaultCsp string, giteaClient *repo.GiteaCl // Is a direct subdomain requested after CNAME lookup? // NOTE: We now require the leading dot because a CNAME to the direct // pages domain makes no sense. - if strings.HasSuffix(cname, "."+pagesDomain) { + if strings.HasSuffix(cname, "."+ctx.PagesDomain) { log.Debugf("%s is alias of %s and can be handled after a CNAME query", req.Host, cname) - handleSubdomain(pagesDomain, req.Host, cname, req.URL.Path, giteaUrl, defaultCsp, giteaClient, lokiConfig, w) + handleSubdomain(ctx, req.Host, cname, req.URL.Path, req, w) return } @@ -132,6 +150,7 @@ func runServer(ctx *cli.Context) error { acmeDisable := ctx.Bool("acme-disable") defaultCsp := ctx.String("default-csp") lokiUrl := ctx.String("loki-url") + metricsBotList := ctx.String("metrics-bot-list") // Init Logging if ctx.Bool("debug") { @@ -141,15 +160,24 @@ func runServer(ctx *cli.Context) error { } // Set up the Loki metrics - var lokiConfig pages.LokiMetricConfig + var lokiConfig metrics.LokiMetricConfig if lokiUrl == "" { - lokiConfig = pages.LokiMetricConfig{ + lokiConfig = metrics.LokiMetricConfig{ Enabled: false, } } else { - lokiConfig = pages.LokiMetricConfig{ - Enabled: true, - Url: lokiUrl, + var patterns []regexp.Regexp + if metricsBotList != "" { + patterns, _ = metrics.ReadBotPatterns(metricsBotList) + } else { + patterns = make([]regexp.Regexp, 0) + } + log.Infof("Read %d bot patterns from disk", len(patterns)) + + lokiConfig = metrics.LokiMetricConfig{ + Enabled: true, + BotUserAgents: &patterns, + Url: lokiUrl, } } @@ -243,6 +271,13 @@ func runServer(ctx *cli.Context) error { listener = tls.NewListener(listener, tlsConfig) } + globalCtx := &context.GlobalContext{ + DefaultCSP: defaultCsp, + PagesDomain: domain, + Gitea: &giteaClient, + MetricConfig: &lokiConfig, + } + var waitGroup sync.WaitGroup servers := 2 if acmeDisable { @@ -254,7 +289,7 @@ func runServer(ctx *cli.Context) error { defer waitGroup.Done() log.Debug("Listening on main HTTP server") - if err := http.Serve(listener, Handler(domain, giteaUrl, defaultCsp, &giteaClient, &lokiConfig)); err != nil { + if err := http.Serve(listener, Handler(globalCtx)); err != nil { log.Fatal(fmt.Errorf("Listening failed: %v", err)) } log.Debug("Listening on main HTTP server done!") @@ -370,6 +405,12 @@ func main() { Value: "", EnvVars: []string{"LOKI_URL"}, }, + &cli.StringFlag{ + Name: "metrics-bot-list", + Usage: "File to read a list of regular expressions modelling bot user agents from", + Value: "", + EnvVars: []string{"METRICS_BOT_LIST"}, + }, }, } diff --git a/internal/context/context.go b/internal/context/context.go new file mode 100644 index 0000000..1ed1237 --- /dev/null +++ b/internal/context/context.go @@ -0,0 +1,30 @@ +package context + +import ( + "net/http" + + "git.polynom.me/rio/internal/metrics" + "git.polynom.me/rio/internal/repo" +) + +type GlobalContext struct { + DefaultCSP string + PagesDomain string + Gitea *repo.GiteaClient + MetricConfig *metrics.LokiMetricConfig +} + +type Context struct { + Username string + Reponame string + Domain string + Path string + + // HTTP Stuff + Referrer string + UserAgent string + Writer http.ResponseWriter + + // Pointer to the global context + Global *GlobalContext +} diff --git a/internal/pages/metrics.go b/internal/metrics/metrics.go similarity index 52% rename from internal/pages/metrics.go rename to internal/metrics/metrics.go index 370b474..b21ed7d 100644 --- a/internal/pages/metrics.go +++ b/internal/metrics/metrics.go @@ -1,9 +1,11 @@ -package pages +package metrics import ( "encoding/json" + "fmt" "io/ioutil" "net/http" + "regexp" "strconv" "strings" "time" @@ -12,16 +14,29 @@ import ( ) type LokiMetricConfig struct { - Url string - Enabled bool + Url string + BotUserAgents *[]regexp.Regexp + Enabled bool } // Checks if we should send a metric ping to Loki based on the served path. -func (c *LokiMetricConfig) shouldSendMetrics(path string) bool { - return strings.HasSuffix(path, ".html") && c.Enabled +func (c *LokiMetricConfig) ShouldSendMetrics(path, userAgent string) bool { + if !strings.HasSuffix(path, ".html") || !c.Enabled { + return false + } + + // Filter out bots + for _, pattern := range *c.BotUserAgents { + if pattern.MatchString(userAgent) { + return false + } + } + + return true } -func (c *LokiMetricConfig) sendMetricPing(domain, path string) { +func (c *LokiMetricConfig) SendMetricPing(domain, path, referrer string) { + msg := fmt.Sprintf("path=\"%s\" referrer=\"%s\"", path, referrer) data := map[string]interface{}{ "streams": []map[string]interface{}{ { @@ -34,7 +49,7 @@ func (c *LokiMetricConfig) sendMetricPing(domain, path string) { "values": [][]interface{}{ { strconv.Itoa(int(time.Now().UnixNano())), - "path=" + path, + msg, }, }, }, @@ -72,3 +87,30 @@ func (c *LokiMetricConfig) sendMetricPing(domain, path string) { } }() } + +// Reads a JSON array of bot user agents from disk and parses them +// into regular expressions. +func ReadBotPatterns(file string) ([]regexp.Regexp, error) { + content, err := ioutil.ReadFile(file) + if err != nil { + log.Warnf("Failed to read bot metrics file: %v", err) + return []regexp.Regexp{}, err + } + + var payload []string + err = json.Unmarshal(content, &payload) + if err != nil { + log.Warnf("Failed to unmarshal file: %v", err) + return []regexp.Regexp{}, err + } + + patterns := make([]regexp.Regexp, 0) + for _, v := range payload { + patterns = append( + patterns, + *regexp.MustCompile(v), + ) + } + + return patterns, nil +} diff --git a/internal/metrics/metrics_test.go b/internal/metrics/metrics_test.go new file mode 100644 index 0000000..02c64aa --- /dev/null +++ b/internal/metrics/metrics_test.go @@ -0,0 +1,24 @@ +package metrics + +import ( + "regexp" + "testing" +) + +func TestShouldPing(t *testing.T) { + cfg := LokiMetricConfig{ + Enabled: true, + Url: "", + BotUserAgents: &[]regexp.Regexp{ + *regexp.MustCompile("random-bot/.*"), + }, + } + + if cfg.ShouldSendMetrics("/index.html", "random-bot/v23.5") { + t.Fatalf("Accepted bot user-agent") + } + + if !cfg.ShouldSendMetrics("/index.html", "Firefox/...") { + t.Fatalf("Rejected real user-agent") + } +} diff --git a/internal/pages/pages.go b/internal/pages/pages.go index 8e14c57..5dae5a4 100644 --- a/internal/pages/pages.go +++ b/internal/pages/pages.go @@ -8,6 +8,7 @@ import ( "time" "git.polynom.me/rio/internal/constants" + "git.polynom.me/rio/internal/context" "git.polynom.me/rio/internal/repo" "github.com/patrickmn/go-cache" @@ -45,13 +46,14 @@ func addHeaders(csp, contentType string, contentLength int, w http.ResponseWrite } } -func ServeFile(username, reponame, path, defaultCsp, domain string, giteaClient *repo.GiteaClient, metricConfig *LokiMetricConfig, w http.ResponseWriter) { +func ServeFile(context *context.Context) { // Strip away a starting / as it messes with Gitea + path := context.Path if path[:1] == "/" { path = path[1:] } - key := makePageContentCacheEntry(username, path) + key := makePageContentCacheEntry(context.Username, path) entry, found := pageCache.Get(key) var content []byte var mimeType string @@ -65,25 +67,25 @@ func ServeFile(username, reponame, path, defaultCsp, domain string, giteaClient since = &sinceRaw } - content, changed, err := giteaClient.GetFile( - username, - reponame, + content, changed, err := context.Global.Gitea.GetFile( + context.Username, + context.Reponame, constants.PagesBranch, path, since, ) - csp := repo.GetCSPForRepository(username, reponame, "", giteaClient) + csp := repo.GetCSPForRepository(context.Username, context.Reponame, "", context.Global.Gitea) if err != nil { if !found { - log.Errorf("Failed to get file %s/%s/%s (%s)", username, reponame, path, err) - addHeaders(csp, "text/html", 0, w) - w.WriteHeader(404) + log.Errorf("Failed to get file %s/%s/%s (%s)", context.Username, context.Reponame, path, err) + addHeaders(csp, "text/html", 0, context.Writer) + context.Writer.WriteHeader(404) } else { log.Debugf("Request failed but page %s is cached in memory", path) - addHeaders(csp, mimeType, len(content), w) - w.WriteHeader(200) - w.Write(content) + addHeaders(csp, mimeType, len(content), context.Writer) + context.Writer.WriteHeader(200) + context.Writer.Write(content) } return @@ -91,9 +93,9 @@ func ServeFile(username, reponame, path, defaultCsp, domain string, giteaClient if found && !changed { log.Debugf("Page %s is unchanged and cached in memory", path) - addHeaders(csp, mimeType, len(content), w) - w.WriteHeader(200) - w.Write(content) + addHeaders(csp, mimeType, len(content), context.Writer) + context.Writer.WriteHeader(200) + context.Writer.Write(content) return } @@ -113,12 +115,12 @@ func ServeFile(username, reponame, path, defaultCsp, domain string, giteaClient ) log.Debugf("Page %s requested from Gitea and cached in memory at %v", path, now) - addHeaders(csp, mimeType, len(content), w) - w.WriteHeader(200) - w.Write(content) + addHeaders(csp, mimeType, len(content), context.Writer) + context.Writer.WriteHeader(200) + context.Writer.Write(content) // Tell Loki about if, if desired - if metricConfig.shouldSendMetrics(path) { - metricConfig.sendMetricPing(domain, path) + if context.Global.MetricConfig.ShouldSendMetrics(path, context.UserAgent) { + context.Global.MetricConfig.SendMetricPing(context.Domain, path, context.Referrer) } }