Implement static serving of compressed files (#387)

This provides an option for #223 without fully resolving it. (I think.)

Essentially, it behaves much like nginx's `gzip_static` and related options: it checks for pre-compressed versions of a file and serves one of those instead if the client accepts that encoding. I couldn't find a pre-existing way to parse the Accept-Encoding header properly (admittedly, I didn't look very hard), so I implemented a parser myself that should be fine.

This should hopefully not have the same DOS vulnerabilities as #302, since it relies on the existing caching system. Compressed versions of files will be cached just like any other files, and that includes cache for missing files as well.

The compressed files will also be accessible directly, and this won't automatically decompress them. So, if you have a `tar.gz` file that you access directly, it will still be downloaded as the gzipped version, although you will now gain the option to download the `.tar` directly and decompress it in transit. (Which doesn't affect the server at all, just the client's way of interpreting it.)

----

One key thing this change also adds is a short-circuit when accessing directories: these always return 404 via the API, although they'd try the cache anyway and go through that route, which was kind of slow. Adding in the additional encodings, it's going to try for .gz, .br, and .zst files in the worst case as well, which feels wrong. So, instead, it just always falls back to the index-check behaviour if the path ends in a slash or is empty. (Which is implicitly just a slash.)

----

For testing, I set up this repo: https://codeberg.org/clarfonthey/testrepo

I ended up realising that LFS isn't supported by default with `just dev`, so it didn't work until I made sure the files in the repo *didn't* use LFS.

Assuming you've run `just dev`, you can go directly to this page in the browser here: https://clarfonthey.localhost.mock.directory:4430/testrepo/
And also you can try a few cURL commands:

```shell
curl https://clarfonthey.localhost.mock.directory:4430/testrepo/ --verbose --insecure
curl -H 'Accept-Encoding: gzip' https://clarfonthey.localhost.mock.directory:4430/testrepo/ --verbose --insecure | gunzip -
curl -H 'Accept-Encoding: br' https://clarfonthey.localhost.mock.directory:4430/testrepo/ --verbose --insecure | brotli --decompress -
curl -H 'Accept-Encoding: zstd' https://clarfonthey.localhost.mock.directory:4430/testrepo/ --verbose --insecure | zstd --decompress -
```

Reviewed-on: https://codeberg.org/Codeberg/pages-server/pulls/387
Reviewed-by: Gusted <gusted@noreply.codeberg.org>
Co-authored-by: ltdk <usr@ltdk.xyz>
Co-committed-by: ltdk <usr@ltdk.xyz>
This commit is contained in:
ltdk 2024-09-29 21:00:54 +00:00 committed by crapStone
parent e5320e1972
commit 5b120f0488
5 changed files with 150 additions and 21 deletions

View file

@ -196,7 +196,7 @@ func (c *AcmeClient) retrieveCertFromDB(sni, mainDomainSuffix string, useDnsProv
// renew certificates 7 days before they expire // renew certificates 7 days before they expire
if tlsCertificate.Leaf.NotAfter.Before(time.Now().Add(7 * 24 * time.Hour)) { if tlsCertificate.Leaf.NotAfter.Before(time.Now().Add(7 * 24 * time.Hour)) {
// TODO: use ValidTill of custom cert struct // TODO: use ValidTill of custom cert struct
if res.CSR != nil && len(res.CSR) > 0 { if len(res.CSR) > 0 {
// CSR stores the time when the renewal shall be tried again // CSR stores the time when the renewal shall be tried again
nextTryUnix, err := strconv.ParseInt(string(res.CSR), 10, 64) nextTryUnix, err := strconv.ParseInt(string(res.CSR), 10, 64)
if err == nil && time.Now().Before(time.Unix(nextTryUnix, 0)) { if err == nil && time.Now().Before(time.Unix(nextTryUnix, 0)) {

View file

@ -37,7 +37,12 @@ type FileResponse struct {
Exists bool Exists bool
IsSymlink bool IsSymlink bool
ETag string ETag string
// uncompressed MIME type
MimeType string MimeType string
// raw MIME type (if compressed, type of compression)
RawMime string
Body []byte Body []byte
} }
@ -45,7 +50,7 @@ func (f FileResponse) IsEmpty() bool {
return len(f.Body) == 0 return len(f.Body) == 0
} }
func (f FileResponse) createHttpResponse(cacheKey string) (header http.Header, statusCode int) { func (f FileResponse) createHttpResponse(cacheKey string, decompress bool) (header http.Header, statusCode int) {
header = make(http.Header) header = make(http.Header)
if f.Exists { if f.Exists {
@ -58,7 +63,12 @@ func (f FileResponse) createHttpResponse(cacheKey string) (header http.Header, s
header.Set(giteaObjectTypeHeader, objTypeSymlink) header.Set(giteaObjectTypeHeader, objTypeSymlink)
} }
header.Set(ETagHeader, f.ETag) header.Set(ETagHeader, f.ETag)
if decompress {
header.Set(ContentTypeHeader, f.MimeType) header.Set(ContentTypeHeader, f.MimeType)
} else {
header.Set(ContentTypeHeader, f.RawMime)
}
header.Set(ContentLengthHeader, fmt.Sprintf("%d", len(f.Body))) header.Set(ContentLengthHeader, fmt.Sprintf("%d", len(f.Body)))
header.Set(PagesCacheIndicatorHeader, "true") header.Set(PagesCacheIndicatorHeader, "true")

View file

@ -42,6 +42,7 @@ const (
ETagHeader = "ETag" ETagHeader = "ETag"
ContentTypeHeader = "Content-Type" ContentTypeHeader = "Content-Type"
ContentLengthHeader = "Content-Length" ContentLengthHeader = "Content-Length"
ContentEncodingHeader = "Content-Encoding"
) )
type Client struct { type Client struct {
@ -103,7 +104,7 @@ func (client *Client) ContentWebLink(targetOwner, targetRepo, branch, resource s
} }
func (client *Client) GiteaRawContent(targetOwner, targetRepo, ref, resource string) ([]byte, error) { func (client *Client) GiteaRawContent(targetOwner, targetRepo, ref, resource string) ([]byte, error) {
reader, _, _, err := client.ServeRawContent(targetOwner, targetRepo, ref, resource) reader, _, _, err := client.ServeRawContent(targetOwner, targetRepo, ref, resource, false)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -111,21 +112,21 @@ func (client *Client) GiteaRawContent(targetOwner, targetRepo, ref, resource str
return io.ReadAll(reader) return io.ReadAll(reader)
} }
func (client *Client) ServeRawContent(targetOwner, targetRepo, ref, resource string) (io.ReadCloser, http.Header, int, error) { func (client *Client) ServeRawContent(targetOwner, targetRepo, ref, resource string, decompress bool) (io.ReadCloser, http.Header, int, error) {
cacheKey := fmt.Sprintf("%s/%s/%s|%s|%s", rawContentCacheKeyPrefix, targetOwner, targetRepo, ref, resource) cacheKey := fmt.Sprintf("%s/%s/%s|%s|%s", rawContentCacheKeyPrefix, targetOwner, targetRepo, ref, resource)
log := log.With().Str("cache_key", cacheKey).Logger() log := log.With().Str("cache_key", cacheKey).Logger()
log.Trace().Msg("try file in cache") log.Trace().Msg("try file in cache")
// handle if cache entry exist // handle if cache entry exist
if cache, ok := client.responseCache.Get(cacheKey); ok { if cache, ok := client.responseCache.Get(cacheKey); ok {
cache := cache.(FileResponse) cache := cache.(FileResponse)
cachedHeader, cachedStatusCode := cache.createHttpResponse(cacheKey) cachedHeader, cachedStatusCode := cache.createHttpResponse(cacheKey, decompress)
// TODO: check against some timestamp mismatch?!? // TODO: check against some timestamp mismatch?!?
if cache.Exists { if cache.Exists {
log.Debug().Msg("[cache] exists") log.Debug().Msg("[cache] exists")
if cache.IsSymlink { if cache.IsSymlink {
linkDest := string(cache.Body) linkDest := string(cache.Body)
log.Debug().Msgf("[cache] follow symlink from %q to %q", resource, linkDest) log.Debug().Msgf("[cache] follow symlink from %q to %q", resource, linkDest)
return client.ServeRawContent(targetOwner, targetRepo, ref, linkDest) return client.ServeRawContent(targetOwner, targetRepo, ref, linkDest, decompress)
} else if !cache.IsEmpty() { } else if !cache.IsEmpty() {
log.Debug().Msgf("[cache] return %d bytes", len(cache.Body)) log.Debug().Msgf("[cache] return %d bytes", len(cache.Body))
return io.NopCloser(bytes.NewReader(cache.Body)), cachedHeader, cachedStatusCode, nil return io.NopCloser(bytes.NewReader(cache.Body)), cachedHeader, cachedStatusCode, nil
@ -170,13 +171,17 @@ func (client *Client) ServeRawContent(targetOwner, targetRepo, ref, resource str
} }
log.Debug().Msgf("follow symlink from %q to %q", resource, linkDest) log.Debug().Msgf("follow symlink from %q to %q", resource, linkDest)
return client.ServeRawContent(targetOwner, targetRepo, ref, linkDest) return client.ServeRawContent(targetOwner, targetRepo, ref, linkDest, decompress)
} }
} }
// now we are sure it's content so set the MIME type // now we are sure it's content so set the MIME type
mimeType := client.getMimeTypeByExtension(resource) mimeType, rawType := client.getMimeTypeByExtension(resource)
if decompress {
resp.Response.Header.Set(ContentTypeHeader, mimeType) resp.Response.Header.Set(ContentTypeHeader, mimeType)
} else {
resp.Response.Header.Set(ContentTypeHeader, rawType)
}
if !shouldRespBeSavedToCache(resp.Response) { if !shouldRespBeSavedToCache(resp.Response) {
return reader, resp.Response.Header, resp.StatusCode, err return reader, resp.Response.Header, resp.StatusCode, err
@ -187,6 +192,7 @@ func (client *Client) ServeRawContent(targetOwner, targetRepo, ref, resource str
Exists: true, Exists: true,
ETag: resp.Header.Get(ETagHeader), ETag: resp.Header.Get(ETagHeader),
MimeType: mimeType, MimeType: mimeType,
RawMime: rawType,
} }
return fileResp.CreateCacheReader(reader, client.responseCache, cacheKey), resp.Response.Header, resp.StatusCode, nil return fileResp.CreateCacheReader(reader, client.responseCache, cacheKey), resp.Response.Header, resp.StatusCode, nil
@ -300,16 +306,31 @@ func (client *Client) GiteaCheckIfOwnerExists(owner string) (bool, error) {
return false, nil return false, nil
} }
func (client *Client) getMimeTypeByExtension(resource string) string { func (client *Client) extToMime(ext string) string {
mimeType := mime.TypeByExtension(path.Ext(resource)) mimeType := mime.TypeByExtension(ext)
mimeTypeSplit := strings.SplitN(mimeType, ";", 2) mimeTypeSplit := strings.SplitN(mimeType, ";", 2)
if client.forbiddenMimeTypes[mimeTypeSplit[0]] || mimeType == "" { if client.forbiddenMimeTypes[mimeTypeSplit[0]] || mimeType == "" {
mimeType = client.defaultMimeType mimeType = client.defaultMimeType
} }
log.Trace().Msgf("probe mime of %q is %q", resource, mimeType)
return mimeType return mimeType
} }
// getMimeTypeByExtension derives two MIME types from the file name in resource.
//
// rawType is the type for the resource's final extension, as resolved by
// extToMime. mimeType is the type of the content underneath a trailing
// compression suffix (".gz", ".br" or ".zst"): for "page.html.gz" it is the
// type for ".html". When the resource has no such suffix, or the extension
// beneath the suffix is the very same suffix again, mimeType equals rawType.
func (client *Client) getMimeTypeByExtension(resource string) (mimeType, rawType string) {
	outerExt := path.Ext(resource)
	rawType = client.extToMime(outerExt)
	mimeType = rawType

	if outerExt == ".gz" || outerExt == ".br" || outerExt == ".zst" {
		// Look at the extension that remains once the compression suffix is removed.
		stem := strings.TrimSuffix(resource, outerExt)
		if wrappedExt := path.Ext(stem); wrappedExt != outerExt {
			mimeType = client.extToMime(wrappedExt)
		}
	}

	log.Trace().Msgf("probe mime of %q is (%q / raw %q)", resource, mimeType, rawType)
	return mimeType, rawType
}
func shouldRespBeSavedToCache(resp *http.Response) bool { func shouldRespBeSavedToCache(resp *http.Response) bool {
if resp == nil { if resp == nil {
return false return false

View file

@ -24,5 +24,8 @@ func (o *Options) setHeader(ctx *context.Context, header http.Header) {
} else { } else {
ctx.RespWriter.Header().Set(gitea.ContentTypeHeader, mime) ctx.RespWriter.Header().Set(gitea.ContentTypeHeader, mime)
} }
if encoding := header.Get(gitea.ContentEncodingHeader); encoding != "" && encoding != "identity" {
ctx.RespWriter.Header().Set(gitea.ContentEncodingHeader, encoding)
}
ctx.RespWriter.Header().Set(headerLastModified, o.BranchTimestamp.In(time.UTC).Format(http.TimeFormat)) ctx.RespWriter.Header().Set(headerLastModified, o.BranchTimestamp.In(time.UTC).Format(http.TimeFormat))
} }

View file

@ -1,10 +1,13 @@
package upstream package upstream
import ( import (
"cmp"
"errors" "errors"
"fmt" "fmt"
"io" "io"
"net/http" "net/http"
"slices"
"strconv"
"strings" "strings"
"time" "time"
@ -19,6 +22,8 @@ import (
const ( const (
headerLastModified = "Last-Modified" headerLastModified = "Last-Modified"
headerIfModifiedSince = "If-Modified-Since" headerIfModifiedSince = "If-Modified-Since"
headerAcceptEncoding = "Accept-Encoding"
headerContentEncoding = "Content-Encoding"
rawMime = "text/plain; charset=utf-8" rawMime = "text/plain; charset=utf-8"
) )
@ -52,6 +57,72 @@ type Options struct {
ServeRaw bool ServeRaw bool
} }
// allowedEncodings maps every content coding this server is willing to serve
// to the file-name suffix under which the pre-encoded variant is looked up.
var allowedEncodings = map[string]string{
	// "identity" is the unencoded file itself, so nothing is appended.
	"identity": "",

	// Pre-compressed variants stored next to the original file.
	"br":   ".br",
	"gzip": ".gz",
	"zstd": ".zst",
}
// AcceptEncodings parses the value of an Accept-Encoding header and returns
// the supported encodings (the keys of allowedEncodings) that the client
// accepts, ordered from most to least preferred.
//
// Parsing rules:
//   - Entries are separated by commas; each is a coding name optionally
//     followed by a ";q=<weight>" parameter. Whitespace around the name, the
//     semicolon and the weight is ignored, and the name and the "q" are matched
//     case-insensitively. Returned names are lower-case.
//   - An entry without a weight has quality 1. An entry whose parameter is not
//     a valid weight in the range [0, 1] is skipped entirely.
//   - Codings not present in allowedEncodings are ignored.
//   - If a coding is listed more than once, its last weight wins and it is
//     returned only once.
//   - A coding with weight 0 is never returned.
//   - "*" with a positive weight adds every supported coding that was not
//     listed explicitly, at that weight.
//   - Without a positive "*", "identity" is appended as the last resort unless
//     the header mentions it explicitly.
//
// Codings with equal quality keep the order in which they first appeared in
// the header. Codings added through "*" follow in a fixed order: alphabetical,
// with the suffix-less coding (the plain file) last.
func AcceptEncodings(header string) []string {
	log.Trace().Msgf("got accept-encoding: %s", header)

	globQuality := 0.0
	qualities := make(map[string]float64, len(allowedEncodings))
	// named records explicitly listed, supported codings in order of first
	// appearance so that duplicates do not produce duplicate results.
	var named []string

	for _, entry := range strings.Split(header, ",") {
		name, weight, hasWeight := strings.Cut(entry, ";")
		name = strings.ToLower(strings.TrimSpace(name))

		quality := 1.0
		if hasWeight {
			weight = strings.TrimSpace(weight)
			if len(weight) < 2 || !strings.EqualFold(weight[:2], "q=") {
				continue
			}
			var err error
			quality, err = strconv.ParseFloat(strings.TrimSpace(weight[2:]), 64)
			// The negated range check also rejects NaN, which ParseFloat accepts.
			if err != nil || !(quality >= 0 && quality <= 1) {
				continue
			}
		}

		if name == "*" {
			globQuality = quality
			continue
		}
		if _, allowed := allowedEncodings[name]; !allowed {
			continue
		}
		if _, seen := qualities[name]; !seen {
			named = append(named, name)
		}
		qualities[name] = quality
	}

	encodings := make([]string, 0, len(allowedEncodings))
	for _, name := range named {
		if qualities[name] > 0 {
			encodings = append(encodings, name)
		}
	}

	if globQuality > 0 {
		var rest []string
		for name := range allowedEncodings {
			if _, exists := qualities[name]; !exists {
				rest = append(rest, name)
				qualities[name] = globQuality
			}
		}
		// Map iteration order is random, so impose a fixed order to keep the
		// result reproducible between requests.
		slices.SortFunc(rest, func(x, y string) int {
			xPlain, yPlain := allowedEncodings[x] == "", allowedEncodings[y] == ""
			if xPlain != yPlain {
				if xPlain {
					return 1
				}
				return -1
			}
			return cmp.Compare(x, y)
		})
		encodings = append(encodings, rest...)
	} else if _, exists := qualities["identity"]; !exists {
		// Negative quality guarantees identity sorts behind every listed coding.
		encodings = append(encodings, "identity")
		qualities["identity"] = -1
	}

	slices.SortStableFunc(encodings, func(x, y string) int {
		// sort in reverse order; big quality comes first
		return cmp.Compare(qualities[y], qualities[x])
	})

	log.Trace().Msgf("decided encoding order: %v", encodings)
	return encodings
}
// Upstream requests a file from the Gitea API at GiteaRoot and writes it to the request context. // Upstream requests a file from the Gitea API at GiteaRoot and writes it to the request context.
func (o *Options) Upstream(ctx *context.Context, giteaClient *gitea.Client, redirectsCache cache.ICache) bool { func (o *Options) Upstream(ctx *context.Context, giteaClient *gitea.Client, redirectsCache cache.ICache) bool {
log := log.With().Strs("upstream", []string{o.TargetOwner, o.TargetRepo, o.TargetBranch, o.TargetPath}).Logger() log := log.With().Strs("upstream", []string{o.TargetOwner, o.TargetRepo, o.TargetBranch, o.TargetPath}).Logger()
@ -97,10 +168,34 @@ func (o *Options) Upstream(ctx *context.Context, giteaClient *gitea.Client, redi
log.Debug().Msg("Preparing") log.Debug().Msg("Preparing")
reader, header, statusCode, err := giteaClient.ServeRawContent(o.TargetOwner, o.TargetRepo, o.TargetBranch, o.TargetPath) var reader io.ReadCloser
var header http.Header
var statusCode int
var err error
// pick first non-404 response for encoding, *only* if not root
if o.TargetPath == "" || strings.HasSuffix(o.TargetPath, "/") {
err = gitea.ErrorNotFound
} else {
for _, encoding := range AcceptEncodings(ctx.Req.Header.Get(headerAcceptEncoding)) {
log.Trace().Msgf("try %s encoding", encoding)
// add extension for encoding
path := o.TargetPath + allowedEncodings[encoding]
reader, header, statusCode, err = giteaClient.ServeRawContent(o.TargetOwner, o.TargetRepo, o.TargetBranch, path, true)
if statusCode == 404 {
continue
}
log.Debug().Msgf("using %s encoding", encoding)
if encoding != "identity" {
header.Set(headerContentEncoding, encoding)
}
break
}
if reader != nil { if reader != nil {
defer reader.Close() defer reader.Close()
} }
}
log.Debug().Msg("Aquisting") log.Debug().Msg("Aquisting")