cache pages (#403)

taken from https://codeberg.org/Codeberg/pages-server/pulls/301

Co-authored-by: Moritz Marquardt <git@momar.de>

Reviewed-on: https://codeberg.org/Codeberg/pages-server/pulls/403
Co-authored-by: crapStone <me@crapstone.dev>
Co-committed-by: crapStone <me@crapstone.dev>
This commit is contained in:
crapStone 2024-11-21 23:26:30 +00:00 committed by crapStone
parent 23a8e83e80
commit 85059aa46e
4 changed files with 120 additions and 117 deletions

View file

@ -2,6 +2,7 @@ package gitea
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
@ -34,23 +35,18 @@ const (
)
type FileResponse struct {
Exists bool
IsSymlink bool
ETag string
// uncompressed MIME type
MimeType string
// raw MIME type (if compressed, type of compression)
RawMime string
Body []byte
Exists bool `json:"exists"`
IsSymlink bool `json:"isSymlink"`
ETag string `json:"eTag"`
MimeType string `json:"mimeType"`
Body []byte `json:"-"` // saved separately
}
func (f FileResponse) IsEmpty() bool {
return len(f.Body) == 0
}
func (f FileResponse) createHttpResponse(cacheKey string, decompress bool) (header http.Header, statusCode int) {
func (f FileResponse) createHttpResponse(cacheKey string) (header http.Header, statusCode int) {
header = make(http.Header)
if f.Exists {
@ -63,12 +59,7 @@ func (f FileResponse) createHttpResponse(cacheKey string, decompress bool) (head
header.Set(giteaObjectTypeHeader, objTypeSymlink)
}
header.Set(ETagHeader, f.ETag)
if decompress {
header.Set(ContentTypeHeader, f.MimeType)
} else {
header.Set(ContentTypeHeader, f.RawMime)
}
header.Set(ContentTypeHeader, f.MimeType)
header.Set(ContentLengthHeader, fmt.Sprintf("%d", len(f.Body)))
header.Set(PagesCacheIndicatorHeader, "true")
@ -77,9 +68,9 @@ func (f FileResponse) createHttpResponse(cacheKey string, decompress bool) (head
}
type BranchTimestamp struct {
Branch string
Timestamp time.Time
notFound bool
NotFound bool `json:"notFound"`
Branch string `json:"branch,omitempty"`
Timestamp time.Time `json:"timestamp,omitempty"`
}
type writeCacheReader struct {
@ -89,32 +80,46 @@ type writeCacheReader struct {
cacheKey string
cache cache.ICache
hasError bool
doNotCache bool
complete bool
}
func (t *writeCacheReader) Read(p []byte) (n int, err error) {
log.Trace().Msgf("[cache] read %q", t.cacheKey)
n, err = t.originalReader.Read(p)
if err == io.EOF {
t.complete = true
}
if err != nil && err != io.EOF {
log.Trace().Err(err).Msgf("[cache] original reader for %q has returned an error", t.cacheKey)
t.hasError = true
} else if n > 0 {
_, _ = t.buffer.Write(p[:n])
if t.buffer.Len()+n > int(fileCacheSizeLimit) {
t.doNotCache = true
t.buffer.Reset()
} else {
_, _ = t.buffer.Write(p[:n])
}
}
return
}
func (t *writeCacheReader) Close() error {
doWrite := !t.hasError
doWrite := !t.hasError && !t.doNotCache && t.complete
fc := *t.fileResponse
fc.Body = t.buffer.Bytes()
if fc.IsEmpty() {
log.Trace().Msg("[cache] file response is empty")
doWrite = false
}
if doWrite {
err := t.cache.Set(t.cacheKey, fc, fileCacheTimeout)
jsonToCache, err := json.Marshal(fc)
if err != nil {
log.Trace().Err(err).Msgf("[cache] writer for %q has returned an error", t.cacheKey)
log.Trace().Err(err).Msgf("[cache] marshaling json for %q has returned an error", t.cacheKey+"|Metadata")
}
err = t.cache.Set(t.cacheKey+"|Metadata", jsonToCache, fileCacheTimeout)
if err != nil {
log.Trace().Err(err).Msgf("[cache] writer for %q has returned an error", t.cacheKey+"|Metadata")
}
err = t.cache.Set(t.cacheKey+"|Body", fc.Body, fileCacheTimeout)
if err != nil {
log.Trace().Err(err).Msgf("[cache] writer for %q has returned an error", t.cacheKey+"|Body")
}
}
log.Trace().Msgf("cacheReader for %q saved=%t closed", t.cacheKey, doWrite)

View file

@ -2,6 +2,7 @@ package gitea
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
@ -39,10 +40,9 @@ const (
objTypeSymlink = "symlink"
// std
ETagHeader = "ETag"
ContentTypeHeader = "Content-Type"
ContentLengthHeader = "Content-Length"
ContentEncodingHeader = "Content-Encoding"
ETagHeader = "ETag"
ContentTypeHeader = "Content-Type"
ContentLengthHeader = "Content-Length"
)
type Client struct {
@ -104,7 +104,7 @@ func (client *Client) ContentWebLink(targetOwner, targetRepo, branch, resource s
}
func (client *Client) GiteaRawContent(targetOwner, targetRepo, ref, resource string) ([]byte, error) {
reader, _, _, err := client.ServeRawContent(targetOwner, targetRepo, ref, resource, false)
reader, _, _, err := client.ServeRawContent(targetOwner, targetRepo, ref, resource)
if err != nil {
return nil, err
}
@ -112,27 +112,42 @@ func (client *Client) GiteaRawContent(targetOwner, targetRepo, ref, resource str
return io.ReadAll(reader)
}
func (client *Client) ServeRawContent(targetOwner, targetRepo, ref, resource string, decompress bool) (io.ReadCloser, http.Header, int, error) {
func (client *Client) ServeRawContent(targetOwner, targetRepo, ref, resource string) (io.ReadCloser, http.Header, int, error) {
cacheKey := fmt.Sprintf("%s/%s/%s|%s|%s", rawContentCacheKeyPrefix, targetOwner, targetRepo, ref, resource)
log := log.With().Str("cache_key", cacheKey).Logger()
log.Trace().Msg("try file in cache")
// handle if cache entry exist
if cache, ok := client.responseCache.Get(cacheKey); ok {
cache := cache.(FileResponse)
cachedHeader, cachedStatusCode := cache.createHttpResponse(cacheKey, decompress)
// TODO: check against some timestamp mismatch?!?
if cacheMetadata, ok := client.responseCache.Get(cacheKey + "|Metadata"); ok {
var cache FileResponse
err := json.Unmarshal(cacheMetadata.([]byte), &cache)
if err != nil {
log.Error().Err(err).Msgf("[cache] failed to unmarshal metadata for: %s", cacheKey)
return nil, nil, http.StatusNotFound, err
}
if !cache.Exists {
return nil, nil, http.StatusNotFound, ErrorNotFound
}
body, ok := client.responseCache.Get(cacheKey + "|Body")
if !ok {
log.Error().Msgf("[cache] failed to get body for: %s", cacheKey)
return nil, nil, http.StatusNotFound, ErrorNotFound
}
cache.Body = body.([]byte)
cachedHeader, cachedStatusCode := cache.createHttpResponse(cacheKey)
if cache.Exists {
log.Debug().Msg("[cache] exists")
if cache.IsSymlink {
linkDest := string(cache.Body)
log.Debug().Msgf("[cache] follow symlink from %q to %q", resource, linkDest)
return client.ServeRawContent(targetOwner, targetRepo, ref, linkDest, decompress)
} else if !cache.IsEmpty() {
return client.ServeRawContent(targetOwner, targetRepo, ref, linkDest)
} else {
log.Debug().Msgf("[cache] return %d bytes", len(cache.Body))
return io.NopCloser(bytes.NewReader(cache.Body)), cachedHeader, cachedStatusCode, nil
} else if cache.IsEmpty() {
log.Debug().Msg("[cache] is empty")
}
} else {
return nil, nil, http.StatusNotFound, ErrorNotFound
}
}
log.Trace().Msg("file not in cache")
@ -166,41 +181,40 @@ func (client *Client) ServeRawContent(targetOwner, targetRepo, ref, resource str
ETag: resp.Header.Get(ETagHeader),
}
log.Trace().Msgf("file response has %d bytes", len(fileResponse.Body))
if err := client.responseCache.Set(cacheKey, fileResponse, fileCacheTimeout); err != nil {
jsonToCache, err := json.Marshal(fileResponse)
if err != nil {
log.Error().Err(err).Msgf("[cache] marshaling json metadata for %q has returned an error", cacheKey)
}
if err := client.responseCache.Set(cacheKey+"|Metadata", jsonToCache, fileCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
if err := client.responseCache.Set(cacheKey+"|Body", fileResponse.Body, fileCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
log.Debug().Msgf("follow symlink from %q to %q", resource, linkDest)
return client.ServeRawContent(targetOwner, targetRepo, ref, linkDest, decompress)
return client.ServeRawContent(targetOwner, targetRepo, ref, linkDest)
}
}
// now we are sure it's content so set the MIME type
mimeType, rawType := client.getMimeTypeByExtension(resource)
if decompress {
resp.Response.Header.Set(ContentTypeHeader, mimeType)
} else {
resp.Response.Header.Set(ContentTypeHeader, rawType)
}
if !shouldRespBeSavedToCache(resp.Response) {
return reader, resp.Response.Header, resp.StatusCode, err
}
mimeType := client.getMimeTypeByExtension(resource)
resp.Response.Header.Set(ContentTypeHeader, mimeType)
// now we write to cache and respond at the same time
fileResp := FileResponse{
Exists: true,
ETag: resp.Header.Get(ETagHeader),
MimeType: mimeType,
RawMime: rawType,
}
return fileResp.CreateCacheReader(reader, client.responseCache, cacheKey), resp.Response.Header, resp.StatusCode, nil
case http.StatusNotFound:
if err := client.responseCache.Set(cacheKey, FileResponse{
Exists: false,
ETag: resp.Header.Get(ETagHeader),
}, fileCacheTimeout); err != nil {
jsonToCache, err := json.Marshal(FileResponse{ETag: resp.Header.Get(ETagHeader)})
if err != nil {
log.Error().Err(err).Msgf("[cache] marshaling json metadata for %q has returned an error", cacheKey)
}
if err := client.responseCache.Set(cacheKey+"|Metadata", jsonToCache, fileCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
@ -215,21 +229,36 @@ func (client *Client) ServeRawContent(targetOwner, targetRepo, ref, resource str
func (client *Client) GiteaGetRepoBranchTimestamp(repoOwner, repoName, branchName string) (*BranchTimestamp, error) {
cacheKey := fmt.Sprintf("%s/%s/%s/%s", branchTimestampCacheKeyPrefix, repoOwner, repoName, branchName)
if stamp, ok := client.responseCache.Get(cacheKey); ok && stamp != nil {
branchTimeStamp := stamp.(*BranchTimestamp)
if branchTimeStamp.notFound {
log.Trace().Msgf("[cache] use branch %q not found", branchName)
if stampRaw, ok := client.responseCache.Get(cacheKey); ok {
var stamp BranchTimestamp
err := json.Unmarshal(stampRaw.([]byte), &stamp)
if err != nil {
log.Error().Err(err).Bytes("stamp", stampRaw.([]byte)).Msgf("[cache] failed to unmarshal timestamp for: %s", cacheKey)
return &BranchTimestamp{}, ErrorNotFound
}
log.Trace().Msgf("[cache] use branch %q exist", branchName)
return branchTimeStamp, nil
if stamp.NotFound {
log.Trace().Msgf("[cache] branch %q does not exist", branchName)
return &BranchTimestamp{}, ErrorNotFound
} else {
log.Trace().Msgf("[cache] use branch %q exist", branchName)
// This comes from the refactoring of the caching library.
// The branch as reported by the API was stored in the cache, and I'm not sure if there are
// situations where it differs from the name in the request, hence this is left here.
return &stamp, nil
}
}
branch, resp, err := client.sdkClient.GetRepoBranch(repoOwner, repoName, branchName)
if err != nil {
if resp != nil && resp.StatusCode == http.StatusNotFound {
log.Trace().Msgf("[cache] set cache branch %q not found", branchName)
if err := client.responseCache.Set(cacheKey, &BranchTimestamp{Branch: branchName, notFound: true}, branchExistenceCacheTimeout); err != nil {
jsonToCache, err := json.Marshal(BranchTimestamp{NotFound: true})
if err != nil {
log.Error().Err(err).Msgf("[cache] marshaling empty timestamp for '%s' has returned an error", cacheKey)
}
if err := client.responseCache.Set(cacheKey, jsonToCache, branchExistenceCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
return &BranchTimestamp{}, ErrorNotFound
@ -246,7 +275,11 @@ func (client *Client) GiteaGetRepoBranchTimestamp(repoOwner, repoName, branchNam
}
log.Trace().Msgf("set cache branch [%s] exist", branchName)
if err := client.responseCache.Set(cacheKey, stamp, branchExistenceCacheTimeout); err != nil {
jsonToCache, err := json.Marshal(stamp)
if err != nil {
log.Error().Err(err).Msgf("[cache] marshaling timestamp for %q has returned an error", cacheKey)
}
if err := client.responseCache.Set(cacheKey, jsonToCache, branchExistenceCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
return stamp, nil
@ -255,8 +288,8 @@ func (client *Client) GiteaGetRepoBranchTimestamp(repoOwner, repoName, branchNam
func (client *Client) GiteaGetRepoDefaultBranch(repoOwner, repoName string) (string, error) {
cacheKey := fmt.Sprintf("%s/%s/%s", defaultBranchCacheKeyPrefix, repoOwner, repoName)
if branch, ok := client.responseCache.Get(cacheKey); ok && branch != nil {
return branch.(string), nil
if branch, ok := client.responseCache.Get(cacheKey); ok {
return string(branch.([]byte)), nil
}
repo, resp, err := client.sdkClient.GetRepo(repoOwner, repoName)
@ -268,7 +301,7 @@ func (client *Client) GiteaGetRepoDefaultBranch(repoOwner, repoName string) (str
}
branch := repo.DefaultBranch
if err := client.responseCache.Set(cacheKey, branch, defaultBranchCacheTimeout); err != nil {
if err := client.responseCache.Set(cacheKey, []byte(branch), defaultBranchCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
return branch, nil
@ -277,13 +310,14 @@ func (client *Client) GiteaGetRepoDefaultBranch(repoOwner, repoName string) (str
func (client *Client) GiteaCheckIfOwnerExists(owner string) (bool, error) {
cacheKey := fmt.Sprintf("%s/%s", ownerExistenceKeyPrefix, owner)
if exist, ok := client.responseCache.Get(cacheKey); ok && exist != nil {
return exist.(bool), nil
if existRaw, ok := client.responseCache.Get(cacheKey); ok && existRaw != nil {
exist, err := strconv.ParseBool(existRaw.(string))
return exist, err
}
_, resp, err := client.sdkClient.GetUserInfo(owner)
if resp.StatusCode == http.StatusOK && err == nil {
if err := client.responseCache.Set(cacheKey, true, ownerExistenceCacheTimeout); err != nil {
if err := client.responseCache.Set(cacheKey, []byte("true"), ownerExistenceCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
return true, nil
@ -293,59 +327,26 @@ func (client *Client) GiteaCheckIfOwnerExists(owner string) (bool, error) {
_, resp, err = client.sdkClient.GetOrg(owner)
if resp.StatusCode == http.StatusOK && err == nil {
if err := client.responseCache.Set(cacheKey, true, ownerExistenceCacheTimeout); err != nil {
if err := client.responseCache.Set(cacheKey, []byte("true"), ownerExistenceCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
return true, nil
} else if resp.StatusCode != http.StatusNotFound {
return false, err
}
if err := client.responseCache.Set(cacheKey, false, ownerExistenceCacheTimeout); err != nil {
if err := client.responseCache.Set(cacheKey, []byte("false"), ownerExistenceCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
return false, nil
}
func (client *Client) extToMime(ext string) string {
mimeType := mime.TypeByExtension(ext)
func (client *Client) getMimeTypeByExtension(resource string) string {
mimeType := mime.TypeByExtension(path.Ext(resource))
mimeTypeSplit := strings.SplitN(mimeType, ";", 2)
if client.forbiddenMimeTypes[mimeTypeSplit[0]] || mimeType == "" {
mimeType = client.defaultMimeType
}
log.Trace().Msgf("probe mime of %q is %q", resource, mimeType)
return mimeType
}
func (client *Client) getMimeTypeByExtension(resource string) (mimeType, rawType string) {
rawExt := path.Ext(resource)
innerExt := rawExt
switch rawExt {
case ".gz", ".br", ".zst":
innerExt = path.Ext(resource[:len(resource)-len(rawExt)])
}
rawType = client.extToMime(rawExt)
mimeType = rawType
if innerExt != rawExt {
mimeType = client.extToMime(innerExt)
}
log.Trace().Msgf("probe mime of %q is (%q / raw %q)", resource, mimeType, rawType)
return mimeType, rawType
}
func shouldRespBeSavedToCache(resp *http.Response) bool {
if resp == nil {
return false
}
contentLengthRaw := resp.Header.Get(ContentLengthHeader)
if contentLengthRaw == "" {
return false
}
contentLength, err := strconv.ParseInt(contentLengthRaw, 10, 64)
if err != nil {
log.Error().Err(err).Msg("could not parse content length")
}
// if content to big or could not be determined we not cache it
return contentLength > 0 && contentLength < fileCacheSizeLimit
}

View file

@ -24,8 +24,5 @@ func (o *Options) setHeader(ctx *context.Context, header http.Header) {
} else {
ctx.RespWriter.Header().Set(gitea.ContentTypeHeader, mime)
}
if encoding := header.Get(gitea.ContentEncodingHeader); encoding != "" && encoding != "identity" {
ctx.RespWriter.Header().Set(gitea.ContentEncodingHeader, encoding)
}
ctx.RespWriter.Header().Set(headerLastModified, o.BranchTimestamp.In(time.UTC).Format(http.TimeFormat))
}

View file

@ -182,7 +182,7 @@ func (o *Options) Upstream(ctx *context.Context, giteaClient *gitea.Client, redi
// add extension for encoding
path := o.TargetPath + allowedEncodings[encoding]
reader, header, statusCode, err = giteaClient.ServeRawContent(o.TargetOwner, o.TargetRepo, o.TargetBranch, path, true)
reader, header, statusCode, err = giteaClient.ServeRawContent(o.TargetOwner, o.TargetRepo, o.TargetBranch, path)
if statusCode == 404 {
continue
}