pages-server/server/gitea/client.go

package gitea

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"mime"
	"net/http"
	"net/url"
	"path"
	"strconv"
	"strings"
	"time"

	"code.gitea.io/sdk/gitea"
	"github.com/rs/zerolog/log"

	"codeberg.org/codeberg/pages/config"
	"codeberg.org/codeberg/pages/server/cache"
	"codeberg.org/codeberg/pages/server/version"
)
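
// ErrorNotFound is returned when the requested resource does not exist on the forge.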
var ErrorNotFound = errors.New("not found")

const (
	// cache key prefixes
	branchTimestampCacheKeyPrefix = "branchTime"
	defaultBranchCacheKeyPrefix   = "defaultBranch"
	rawContentCacheKeyPrefix      = "rawContent"
	ownerExistenceKeyPrefix       = "ownerExist"

	// pages server
	PagesCacheIndicatorHeader = "X-Pages-Cache"
	symlinkReadLimit          = 10000

	// gitea
	giteaObjectTypeHeader = "X-Gitea-Object-Type"
	objTypeSymlink        = "symlink"

	// std
	ETagHeader            = "ETag"
	ContentTypeHeader     = "Content-Type"
	ContentLengthHeader   = "Content-Length"
	ContentEncodingHeader = "Content-Encoding"
)
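
// Client wraps the Gitea/Forgejo SDK client together with the response cache
// and the serving options used by the pages server.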
type Client struct {
	sdkClient          *gitea.Client
	responseCache      cache.ICache
	giteaRoot          string
	followSymlinks     bool
	supportLFS         bool
	forbiddenMimeTypes map[string]bool
	defaultMimeType    string
}
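
// NewClient builds a forge API client from the given configuration: it validates
// and normalizes the root URL, prepares the forbidden/default MIME type settings,
// and creates the underlying SDK client with a 10 second HTTP timeout.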
func NewClient(cfg config.ForgeConfig, respCache cache.ICache) (*Client, error) {
	// url.Parse accepts almost anything, so use ParseRequestURI for stricter validation
	rootURL, err := url.ParseRequestURI(cfg.Root)
	if err != nil {
		return nil, fmt.Errorf("invalid forgejo/gitea root url: %w", err)
	}
	giteaRoot := strings.TrimSuffix(rootURL.String(), "/")

	stdClient := http.Client{Timeout: 10 * time.Second}

	forbiddenMimeTypes := make(map[string]bool, len(cfg.ForbiddenMimeTypes))
	for _, mimeType := range cfg.ForbiddenMimeTypes {
		forbiddenMimeTypes[mimeType] = true
	}

	defaultMimeType := cfg.DefaultMimeType
	if defaultMimeType == "" {
		defaultMimeType = "application/octet-stream"
	}

	sdk, err := gitea.NewClient(
		giteaRoot,
		gitea.SetHTTPClient(&stdClient),
		gitea.SetToken(cfg.Token),
		gitea.SetUserAgent("pages-server/"+version.Version),
	)

	return &Client{
		sdkClient:          sdk,
		responseCache:      respCache,
		giteaRoot:          giteaRoot,
		followSymlinks:     cfg.FollowSymlinks,
		supportLFS:         cfg.LFSEnabled,
		forbiddenMimeTypes: forbiddenMimeTypes,
		defaultMimeType:    defaultMimeType,
	}, err
}
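
// ContentWebLink returns the forge web link to a resource on the given branch
// of the target repository.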
func (client *Client) ContentWebLink(targetOwner, targetRepo, branch, resource string) string {
	return path.Join(client.giteaRoot, targetOwner, targetRepo, "src/branch", branch, resource)
}
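
// GiteaRawContent returns the raw content of a file at the given ref, fully read into memory.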
func (client *Client) GiteaRawContent(targetOwner, targetRepo, ref, resource string) ([]byte, error) {
	reader, _, _, err := client.ServeRawContent(targetOwner, targetRepo, ref, resource, false)
	if err != nil {
		return nil, err
	}
	defer reader.Close()

	return io.ReadAll(reader)
}
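
// ServeRawContent serves the raw content of a file from the forge (or from the
// response cache), following symlinks if configured. It returns a reader for the
// body together with the response headers and status code; when decompress is set,
// the Content-Type reflects the inner (decompressed) file type. Successful and
// not-found responses are written to the cache.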
func (client *Client) ServeRawContent(targetOwner, targetRepo, ref, resource string, decompress bool) (io.ReadCloser, http.Header, int, error) {
	cacheKey := fmt.Sprintf("%s/%s/%s|%s|%s", rawContentCacheKeyPrefix, targetOwner, targetRepo, ref, resource)
	log := log.With().Str("cache_key", cacheKey).Logger()
log.Trace().Msg("try file in cache")
// handle if cache entry exist
if cache, ok := client.responseCache.Get(cacheKey); ok {
cache := cache.(FileResponse)
		cachedHeader, cachedStatusCode := cache.createHttpResponse(cacheKey, decompress)
		// TODO: check against some timestamp mismatch?!?
		if cache.Exists {
log.Debug().Msg("[cache] exists")
if cache.IsSymlink {
linkDest := string(cache.Body)
log.Debug().Msgf("[cache] follow symlink from %q to %q", resource, linkDest)
				return client.ServeRawContent(targetOwner, targetRepo, ref, linkDest, decompress)
			} else if !cache.IsEmpty() {
				log.Debug().Msgf("[cache] return %d bytes", len(cache.Body))
				return io.NopCloser(bytes.NewReader(cache.Body)), cachedHeader, cachedStatusCode, nil
			} else if cache.IsEmpty() {
				log.Debug().Msg("[cache] is empty")
			}
		}
	}
log.Trace().Msg("file not in cache")
// not in cache, open reader via gitea api
reader, resp, err := client.sdkClient.GetFileReader(targetOwner, targetRepo, ref, resource, client.supportLFS)
if resp != nil {
switch resp.StatusCode {
case http.StatusOK:
// first handle symlinks
{
objType := resp.Header.Get(giteaObjectTypeHeader)
log.Trace().Msgf("server raw content object %q", objType)
if client.followSymlinks && objType == objTypeSymlink {
defer reader.Close()
// read limited chars for symlink
linkDestBytes, err := io.ReadAll(io.LimitReader(reader, symlinkReadLimit))
if err != nil {
return nil, nil, http.StatusInternalServerError, err
}
linkDest := strings.TrimSpace(string(linkDestBytes))
// handle relative links
// we first remove the link from the path, and make a relative join (resolve parent paths like "/../" too)
linkDest = path.Join(path.Dir(resource), linkDest)
// we store symlink not content to reduce duplicates in cache
					fileResponse := FileResponse{
						Exists:    true,
						IsSymlink: true,
						Body:      []byte(linkDest),
						ETag:      resp.Header.Get(ETagHeader),
					}
					log.Trace().Msgf("file response has %d bytes", len(fileResponse.Body))
					if err := client.responseCache.Set(cacheKey, fileResponse, fileCacheTimeout); err != nil {
						log.Error().Err(err).Msg("[cache] error on cache write")
					}

					log.Debug().Msgf("follow symlink from %q to %q", resource, linkDest)
					return client.ServeRawContent(targetOwner, targetRepo, ref, linkDest, decompress)
				}
			}

			// now we are sure it's content so set the MIME type
			mimeType, rawType := client.getMimeTypeByExtension(resource)
			if decompress {
				resp.Response.Header.Set(ContentTypeHeader, mimeType)
			} else {
				resp.Response.Header.Set(ContentTypeHeader, rawType)
			}

			if !shouldRespBeSavedToCache(resp.Response) {
				return reader, resp.Response.Header, resp.StatusCode, err
			}

			// now we write to cache and respond at the same time
			fileResp := FileResponse{
				Exists:   true,
				ETag:     resp.Header.Get(ETagHeader),
				MimeType: mimeType,
				RawMime:  rawType,
			}
			return fileResp.CreateCacheReader(reader, client.responseCache, cacheKey), resp.Response.Header, resp.StatusCode, nil

		case http.StatusNotFound:
			if err := client.responseCache.Set(cacheKey, FileResponse{
				Exists: false,
				ETag:   resp.Header.Get(ETagHeader),
			}, fileCacheTimeout); err != nil {
				log.Error().Err(err).Msg("[cache] error on cache write")
			}

			return nil, resp.Response.Header, http.StatusNotFound, ErrorNotFound
		default:
			return nil, resp.Response.Header, resp.StatusCode, fmt.Errorf("unexpected status code '%d'", resp.StatusCode)
		}
	}

	return nil, nil, http.StatusInternalServerError, err
}
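
// GiteaGetRepoBranchTimestamp returns the branch name and the timestamp of its
// latest commit, caching both positive and "not found" results.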
func (client *Client) GiteaGetRepoBranchTimestamp(repoOwner, repoName, branchName string) (*BranchTimestamp, error) {
	cacheKey := fmt.Sprintf("%s/%s/%s/%s", branchTimestampCacheKeyPrefix, repoOwner, repoName, branchName)

	if stamp, ok := client.responseCache.Get(cacheKey); ok && stamp != nil {
		branchTimeStamp := stamp.(*BranchTimestamp)
		if branchTimeStamp.notFound {
			log.Trace().Msgf("[cache] use branch %q not found", branchName)
			return &BranchTimestamp{}, ErrorNotFound
		}
		log.Trace().Msgf("[cache] use branch %q exist", branchName)
		return branchTimeStamp, nil
	}

	branch, resp, err := client.sdkClient.GetRepoBranch(repoOwner, repoName, branchName)
	if err != nil {
		if resp != nil && resp.StatusCode == http.StatusNotFound {
			log.Trace().Msgf("[cache] set cache branch %q not found", branchName)
			if err := client.responseCache.Set(cacheKey, &BranchTimestamp{Branch: branchName, notFound: true}, branchExistenceCacheTimeout); err != nil {
				log.Error().Err(err).Msg("[cache] error on cache write")
			}
			return &BranchTimestamp{}, ErrorNotFound
		}
		return &BranchTimestamp{}, err
	}
	if resp.StatusCode != http.StatusOK {
		return &BranchTimestamp{}, fmt.Errorf("unexpected status code '%d'", resp.StatusCode)
	}

	stamp := &BranchTimestamp{
		Branch:    branch.Name,
		Timestamp: branch.Commit.Timestamp,
	}

	log.Trace().Msgf("set cache branch [%s] exist", branchName)
	if err := client.responseCache.Set(cacheKey, stamp, branchExistenceCacheTimeout); err != nil {
		log.Error().Err(err).Msg("[cache] error on cache write")
	}
	return stamp, nil
}
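
// GiteaGetRepoDefaultBranch returns the default branch of a repository, cached
// for defaultBranchCacheTimeout.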
func (client *Client) GiteaGetRepoDefaultBranch(repoOwner, repoName string) (string, error) {
	cacheKey := fmt.Sprintf("%s/%s/%s", defaultBranchCacheKeyPrefix, repoOwner, repoName)

	if branch, ok := client.responseCache.Get(cacheKey); ok && branch != nil {
		return branch.(string), nil
	}

	repo, resp, err := client.sdkClient.GetRepo(repoOwner, repoName)
	if err != nil {
		return "", err
	}
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("unexpected status code '%d'", resp.StatusCode)
	}

	branch := repo.DefaultBranch
	if err := client.responseCache.Set(cacheKey, branch, defaultBranchCacheTimeout); err != nil {
		log.Error().Err(err).Msg("[cache] error on cache write")
	}
	return branch, nil
}
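
// GiteaCheckIfOwnerExists checks whether an owner exists on the forge, either as
// a user or as an organization; the result is cached for ownerExistenceCacheTimeout.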
func (client *Client) GiteaCheckIfOwnerExists(owner string) (bool, error) {
	cacheKey := fmt.Sprintf("%s/%s", ownerExistenceKeyPrefix, owner)

	if exist, ok := client.responseCache.Get(cacheKey); ok && exist != nil {
		return exist.(bool), nil
	}

	_, resp, err := client.sdkClient.GetUserInfo(owner)
	if resp.StatusCode == http.StatusOK && err == nil {
		if err := client.responseCache.Set(cacheKey, true, ownerExistenceCacheTimeout); err != nil {
			log.Error().Err(err).Msg("[cache] error on cache write")
		}
		return true, nil
	} else if resp.StatusCode != http.StatusNotFound {
		return false, err
	}

	_, resp, err = client.sdkClient.GetOrg(owner)
	if resp.StatusCode == http.StatusOK && err == nil {
		if err := client.responseCache.Set(cacheKey, true, ownerExistenceCacheTimeout); err != nil {
			log.Error().Err(err).Msg("[cache] error on cache write")
		}
		return true, nil
	} else if resp.StatusCode != http.StatusNotFound {
		return false, err
	}

	if err := client.responseCache.Set(cacheKey, false, ownerExistenceCacheTimeout); err != nil {
		log.Error().Err(err).Msg("[cache] error on cache write")
	}
	return false, nil
}
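
// extToMime maps a file extension to its MIME type, falling back to the configured
// default type when the extension is unknown or the type is forbidden.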
func (client *Client) extToMime(ext string) string {
	mimeType := mime.TypeByExtension(ext)
	mimeTypeSplit := strings.SplitN(mimeType, ";", 2)
	if client.forbiddenMimeTypes[mimeTypeSplit[0]] || mimeType == "" {
		mimeType = client.defaultMimeType
	}
	return mimeType
}
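
// getMimeTypeByExtension returns both the MIME type of the file itself (rawType)
// and, for pre-compressed files (.gz, .br, .zst), the MIME type of the inner file
// (mimeType). For uncompressed files both values are identical.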
func (client *Client) getMimeTypeByExtension(resource string) (mimeType, rawType string) {
	rawExt := path.Ext(resource)
	innerExt := rawExt
	switch rawExt {
	case ".gz", ".br", ".zst":
		innerExt = path.Ext(resource[:len(resource)-len(rawExt)])
	}
	rawType = client.extToMime(rawExt)
	mimeType = rawType
	if innerExt != rawExt {
		mimeType = client.extToMime(innerExt)
	}
	log.Trace().Msgf("probe mime of %q is (%q / raw %q)", resource, mimeType, rawType)
	return mimeType, rawType
}
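
// shouldRespBeSavedToCache reports whether a forge response is small enough to be
// cached: it requires a parsable, positive Content-Length below fileCacheSizeLimit.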
func shouldRespBeSavedToCache(resp *http.Response) bool {
	if resp == nil {
		return false
	}

	contentLengthRaw := resp.Header.Get(ContentLengthHeader)
	if contentLengthRaw == "" {
		return false
	}

	contentLength, err := strconv.ParseInt(contentLengthRaw, 10, 64)
	if err != nil {
		log.Error().Err(err).Msg("could not parse content length")
	}

	// if the content is too big or its size could not be determined, we do not cache it
	return contentLength > 0 && contentLength < fileCacheSizeLimit
}