pages-server/server/gitea/client.go

317 lines
11 KiB
Go
Raw Permalink Normal View History

package gitea
import (
"bytes"
"errors"
"fmt"
"io"
"mime"
"net/http"
"net/url"
"path"
"strings"
"time"
"code.gitea.io/sdk/gitea"
"github.com/rs/zerolog/log"
"codeberg.org/codeberg/pages/config"
"codeberg.org/codeberg/pages/server/cache"
"codeberg.org/codeberg/pages/server/version"
)
var ErrorNotFound = errors.New("not found")
const (
// cache key prefixes
branchTimestampCacheKeyPrefix = "branchTime"
defaultBranchCacheKeyPrefix = "defaultBranch"
rawContentCacheKeyPrefix = "rawContent"
ownerExistenceKeyPrefix = "ownerExist"
// pages server
PagesCacheIndicatorHeader = "X-Pages-Cache"
symlinkReadLimit = 10000
// gitea
giteaObjectTypeHeader = "X-Gitea-Object-Type"
objTypeSymlink = "symlink"
// std
ETagHeader = "ETag"
ContentTypeHeader = "Content-Type"
ContentLengthHeader = "Content-Length"
)
type Client struct {
sdkClient *gitea.Client
responseCache cache.ICache
giteaRoot string
followSymlinks bool
supportLFS bool
forbiddenMimeTypes map[string]bool
defaultMimeType string
}
func NewClient(cfg config.GiteaConfig, respCache cache.ICache) (*Client, error) {
rootURL, err := url.Parse(cfg.Root)
if err != nil {
return nil, err
}
giteaRoot := strings.Trim(rootURL.String(), "/")
stdClient := http.Client{Timeout: 10 * time.Second}
forbiddenMimeTypes := make(map[string]bool, len(cfg.ForbiddenMimeTypes))
for _, mimeType := range cfg.ForbiddenMimeTypes {
forbiddenMimeTypes[mimeType] = true
}
defaultMimeType := cfg.DefaultMimeType
if defaultMimeType == "" {
defaultMimeType = "application/octet-stream"
}
sdk, err := gitea.NewClient(
giteaRoot,
gitea.SetHTTPClient(&stdClient),
gitea.SetToken(cfg.Token),
gitea.SetUserAgent("pages-server/"+version.Version),
)
return &Client{
sdkClient: sdk,
responseCache: respCache,
giteaRoot: giteaRoot,
followSymlinks: cfg.FollowSymlinks,
supportLFS: cfg.LFSEnabled,
forbiddenMimeTypes: forbiddenMimeTypes,
defaultMimeType: defaultMimeType,
}, err
}
func (client *Client) ContentWebLink(targetOwner, targetRepo, branch, resource string) string {
return path.Join(client.giteaRoot, targetOwner, targetRepo, "src/branch", branch, resource)
}
func (client *Client) GiteaRawContent(targetOwner, targetRepo, ref, resource string) ([]byte, error) {
reader, _, _, err := client.ServeRawContent(targetOwner, targetRepo, ref, resource)
if err != nil {
return nil, err
}
defer reader.Close()
return io.ReadAll(reader)
}
func (client *Client) ServeRawContent(targetOwner, targetRepo, ref, resource string) (io.ReadCloser, http.Header, int, error) {
cacheKey := fmt.Sprintf("%s/%s/%s|%s|%s", rawContentCacheKeyPrefix, targetOwner, targetRepo, ref, resource)
log := log.With().Str("cache_key", cacheKey).Logger()
FIX blank internal pages (#164) (#292) Hello 👋 since it affected my deployment of the pages server I started to look into the problem of the blank pages and think I found a solution for it: 1. There is no check if the file response is empty, neither in cache retrieval nor in writing of a cache. Also the provided method for checking for empty responses had a bug. 2. I identified the redirect response to be the issue here. There is a cache write with the full cache key (e. g. rawContent/user/repo|branch|route/index.html) happening in the handling of the redirect response. But the written body here is empty. In the triggered request from the redirect response the server then finds a cache item to the key and serves the empty body. A quick fix is the check for empty file responses mentioned in 1. 3. The decision to redirect the user comes quite far down in the upstream function. Before that happens a lot of stuff that may not be important since after the redirect response comes a new request anyway. Also, I suspect that this causes the caching problem because there is a request to the forge server and its error handling with some recursions happening before. I propose to move two of the redirects before "Preparing" 4. The recursion in the upstream function makes it difficult to understand what is actually happening. I added some more logging to have an easier time with that. 5. I changed the default behaviour to append a trailing slash to the path to true. In my tested scenarios it happened anyway. This way there is no recursion happening before the redirect. I am not developing in go frequently and rarely contribute to open source -> so feedback of all kind is appreciated closes #164 Reviewed-on: https://codeberg.org/Codeberg/pages-server/pulls/292 Reviewed-by: 6543 <6543@obermui.de> Reviewed-by: crapStone <codeberg@crapstone.dev> Co-authored-by: Hoernschen <julian.hoernschemeyer@mailbox.org> Co-committed-by: Hoernschen <julian.hoernschemeyer@mailbox.org>
2024-02-26 22:21:42 +00:00
log.Trace().Msg("try file in cache")
// handle if cache entry exist
if cacheMetadata, ok := client.responseCache.Get(cacheKey + "|Metadata"); ok {
cache := FileResponseFromMetadataString(string(cacheMetadata))
cache.Body, _ = client.responseCache.Get(cacheKey + "|Body")
// TODO: don't grab the content from the cache if the ETag matches?!
cachedHeader, cachedStatusCode := cache.createHttpResponse(cacheKey)
// TODO: check against some timestamp mismatch?!?
if cache.Exists {
if cache.IsSymlink {
linkDest := string(cache.Body)
log.Debug().Msgf("[cache] follow symlink from %q to %q", resource, linkDest)
return client.ServeRawContent(targetOwner, targetRepo, ref, linkDest)
} else {
FIX blank internal pages (#164) (#292) Hello 👋 since it affected my deployment of the pages server I started to look into the problem of the blank pages and think I found a solution for it: 1. There is no check if the file response is empty, neither in cache retrieval nor in writing of a cache. Also the provided method for checking for empty responses had a bug. 2. I identified the redirect response to be the issue here. There is a cache write with the full cache key (e. g. rawContent/user/repo|branch|route/index.html) happening in the handling of the redirect response. But the written body here is empty. In the triggered request from the redirect response the server then finds a cache item to the key and serves the empty body. A quick fix is the check for empty file responses mentioned in 1. 3. The decision to redirect the user comes quite far down in the upstream function. Before that happens a lot of stuff that may not be important since after the redirect response comes a new request anyway. Also, I suspect that this causes the caching problem because there is a request to the forge server and its error handling with some recursions happening before. I propose to move two of the redirects before "Preparing" 4. The recursion in the upstream function makes it difficult to understand what is actually happening. I added some more logging to have an easier time with that. 5. I changed the default behaviour to append a trailing slash to the path to true. In my tested scenarios it happened anyway. This way there is no recursion happening before the redirect. I am not developing in go frequently and rarely contribute to open source -> so feedback of all kind is appreciated closes #164 Reviewed-on: https://codeberg.org/Codeberg/pages-server/pulls/292 Reviewed-by: 6543 <6543@obermui.de> Reviewed-by: crapStone <codeberg@crapstone.dev> Co-authored-by: Hoernschen <julian.hoernschemeyer@mailbox.org> Co-committed-by: Hoernschen <julian.hoernschemeyer@mailbox.org>
2024-02-26 22:21:42 +00:00
log.Debug().Msgf("[cache] return %d bytes", len(cache.Body))
return io.NopCloser(bytes.NewReader(cache.Body)), cachedHeader, cachedStatusCode, nil
}
} else {
return nil, nil, http.StatusNotFound, ErrorNotFound
}
}
// TODO: metadata not written, is close ever called?
FIX blank internal pages (#164) (#292) Hello 👋 since it affected my deployment of the pages server I started to look into the problem of the blank pages and think I found a solution for it: 1. There is no check if the file response is empty, neither in cache retrieval nor in writing of a cache. Also the provided method for checking for empty responses had a bug. 2. I identified the redirect response to be the issue here. There is a cache write with the full cache key (e. g. rawContent/user/repo|branch|route/index.html) happening in the handling of the redirect response. But the written body here is empty. In the triggered request from the redirect response the server then finds a cache item to the key and serves the empty body. A quick fix is the check for empty file responses mentioned in 1. 3. The decision to redirect the user comes quite far down in the upstream function. Before that happens a lot of stuff that may not be important since after the redirect response comes a new request anyway. Also, I suspect that this causes the caching problem because there is a request to the forge server and its error handling with some recursions happening before. I propose to move two of the redirects before "Preparing" 4. The recursion in the upstream function makes it difficult to understand what is actually happening. I added some more logging to have an easier time with that. 5. I changed the default behaviour to append a trailing slash to the path to true. In my tested scenarios it happened anyway. This way there is no recursion happening before the redirect. I am not developing in go frequently and rarely contribute to open source -> so feedback of all kind is appreciated closes #164 Reviewed-on: https://codeberg.org/Codeberg/pages-server/pulls/292 Reviewed-by: 6543 <6543@obermui.de> Reviewed-by: crapStone <codeberg@crapstone.dev> Co-authored-by: Hoernschen <julian.hoernschemeyer@mailbox.org> Co-committed-by: Hoernschen <julian.hoernschemeyer@mailbox.org>
2024-02-26 22:21:42 +00:00
log.Trace().Msg("file not in cache")
// not in cache, open reader via gitea api
reader, resp, err := client.sdkClient.GetFileReader(targetOwner, targetRepo, ref, resource, client.supportLFS)
if resp != nil {
switch resp.StatusCode {
case http.StatusOK:
// first handle symlinks
{
objType := resp.Header.Get(giteaObjectTypeHeader)
log.Trace().Msgf("server raw content object %q", objType)
if client.followSymlinks && objType == objTypeSymlink {
defer reader.Close()
// read limited chars for symlink
linkDestBytes, err := io.ReadAll(io.LimitReader(reader, symlinkReadLimit))
if err != nil {
return nil, nil, http.StatusInternalServerError, err
}
linkDest := strings.TrimSpace(string(linkDestBytes))
// handle relative links
// we first remove the link from the path, and make a relative join (resolve parent paths like "/../" too)
linkDest = path.Join(path.Dir(resource), linkDest)
// we store symlink not content to reduce duplicates in cache
FIX blank internal pages (#164) (#292) Hello 👋 since it affected my deployment of the pages server I started to look into the problem of the blank pages and think I found a solution for it: 1. There is no check if the file response is empty, neither in cache retrieval nor in writing of a cache. Also the provided method for checking for empty responses had a bug. 2. I identified the redirect response to be the issue here. There is a cache write with the full cache key (e. g. rawContent/user/repo|branch|route/index.html) happening in the handling of the redirect response. But the written body here is empty. In the triggered request from the redirect response the server then finds a cache item to the key and serves the empty body. A quick fix is the check for empty file responses mentioned in 1. 3. The decision to redirect the user comes quite far down in the upstream function. Before that happens a lot of stuff that may not be important since after the redirect response comes a new request anyway. Also, I suspect that this causes the caching problem because there is a request to the forge server and its error handling with some recursions happening before. I propose to move two of the redirects before "Preparing" 4. The recursion in the upstream function makes it difficult to understand what is actually happening. I added some more logging to have an easier time with that. 5. I changed the default behaviour to append a trailing slash to the path to true. In my tested scenarios it happened anyway. This way there is no recursion happening before the redirect. I am not developing in go frequently and rarely contribute to open source -> so feedback of all kind is appreciated closes #164 Reviewed-on: https://codeberg.org/Codeberg/pages-server/pulls/292 Reviewed-by: 6543 <6543@obermui.de> Reviewed-by: crapStone <codeberg@crapstone.dev> Co-authored-by: Hoernschen <julian.hoernschemeyer@mailbox.org> Co-committed-by: Hoernschen <julian.hoernschemeyer@mailbox.org>
2024-02-26 22:21:42 +00:00
fileResponse := FileResponse{
Exists: true,
IsSymlink: true,
Body: []byte(linkDest),
ETag: resp.Header.Get(ETagHeader),
FIX blank internal pages (#164) (#292) Hello 👋 since it affected my deployment of the pages server I started to look into the problem of the blank pages and think I found a solution for it: 1. There is no check if the file response is empty, neither in cache retrieval nor in writing of a cache. Also the provided method for checking for empty responses had a bug. 2. I identified the redirect response to be the issue here. There is a cache write with the full cache key (e. g. rawContent/user/repo|branch|route/index.html) happening in the handling of the redirect response. But the written body here is empty. In the triggered request from the redirect response the server then finds a cache item to the key and serves the empty body. A quick fix is the check for empty file responses mentioned in 1. 3. The decision to redirect the user comes quite far down in the upstream function. Before that happens a lot of stuff that may not be important since after the redirect response comes a new request anyway. Also, I suspect that this causes the caching problem because there is a request to the forge server and its error handling with some recursions happening before. I propose to move two of the redirects before "Preparing" 4. The recursion in the upstream function makes it difficult to understand what is actually happening. I added some more logging to have an easier time with that. 5. I changed the default behaviour to append a trailing slash to the path to true. In my tested scenarios it happened anyway. This way there is no recursion happening before the redirect. I am not developing in go frequently and rarely contribute to open source -> so feedback of all kind is appreciated closes #164 Reviewed-on: https://codeberg.org/Codeberg/pages-server/pulls/292 Reviewed-by: 6543 <6543@obermui.de> Reviewed-by: crapStone <codeberg@crapstone.dev> Co-authored-by: Hoernschen <julian.hoernschemeyer@mailbox.org> Co-committed-by: Hoernschen <julian.hoernschemeyer@mailbox.org>
2024-02-26 22:21:42 +00:00
}
log.Trace().Msgf("file response has %d bytes", len(fileResponse.Body))
if err := client.responseCache.Set(cacheKey+"|Metadata", []byte(fileResponse.MetadataAsString()), fileCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
// TODO: Test with binary files, as we convert []byte to string! Using []byte values might makes more sense anyways.
if err := client.responseCache.Set(cacheKey+"|Body", fileResponse.Body, fileCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
log.Debug().Msgf("follow symlink from %q to %q", resource, linkDest)
return client.ServeRawContent(targetOwner, targetRepo, ref, linkDest)
}
}
// now we are sure it's content so set the MIME type
mimeType := client.getMimeTypeByExtension(resource)
resp.Response.Header.Set(ContentTypeHeader, mimeType)
// now we write to cache and respond at the same time
fileResp := FileResponse{
Exists: true,
ETag: resp.Header.Get(ETagHeader),
MimeType: mimeType,
}
return fileResp.CreateCacheReader(reader, client.responseCache, cacheKey), resp.Response.Header, resp.StatusCode, nil
case http.StatusNotFound:
if err := client.responseCache.Set(cacheKey+"|Metadata", []byte(FileResponse{ETag: resp.Header.Get(ETagHeader)}.MetadataAsString()), fileCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
return nil, resp.Response.Header, http.StatusNotFound, ErrorNotFound
default:
return nil, resp.Response.Header, resp.StatusCode, fmt.Errorf("unexpected status code '%d'", resp.StatusCode)
}
}
return nil, nil, http.StatusInternalServerError, err
}
func (client *Client) GiteaGetRepoBranchTimestamp(repoOwner, repoName, branchName string) (*BranchTimestamp, error) {
cacheKey := fmt.Sprintf("%s/%s/%s/%s", branchTimestampCacheKeyPrefix, repoOwner, repoName, branchName)
if stamp, ok := client.responseCache.Get(cacheKey); ok {
if len(stamp) == 0 {
log.Trace().Msgf("[cache] use branch %q not found", branchName)
return &BranchTimestamp{}, ErrorNotFound
}
log.Trace().Msgf("[cache] use branch %q exist", branchName)
// This comes from the refactoring of the caching library.
// The branch as reported by the API was stored in the cache, and I'm not sure if there are
// situations where it differs from the name in the request, hence this is left here.
stampParts := strings.SplitN(string(stamp), "|", 2)
stampTime, _ := time.Parse(time.RFC3339, stampParts[0])
return &BranchTimestamp{
Branch: stampParts[1],
Timestamp: stampTime,
}, nil
}
branch, resp, err := client.sdkClient.GetRepoBranch(repoOwner, repoName, branchName)
if err != nil {
if resp != nil && resp.StatusCode == http.StatusNotFound {
log.Trace().Msgf("[cache] set cache branch %q not found", branchName)
if err := client.responseCache.Set(cacheKey, []byte{}, branchExistenceCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
return &BranchTimestamp{}, ErrorNotFound
}
return &BranchTimestamp{}, err
}
if resp.StatusCode != http.StatusOK {
return &BranchTimestamp{}, fmt.Errorf("unexpected status code '%d'", resp.StatusCode)
}
stamp := &BranchTimestamp{
Branch: branch.Name,
Timestamp: branch.Commit.Timestamp,
}
log.Trace().Msgf("set cache branch [%s] exist", branchName)
if err := client.responseCache.Set(cacheKey, []byte(stamp.Timestamp.Format(time.RFC3339)+"|"+stamp.Branch), branchExistenceCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
return stamp, nil
}
func (client *Client) GiteaGetRepoDefaultBranch(repoOwner, repoName string) (string, error) {
cacheKey := fmt.Sprintf("%s/%s/%s", defaultBranchCacheKeyPrefix, repoOwner, repoName)
if branch, ok := client.responseCache.Get(cacheKey); ok {
return string(branch), nil
}
repo, resp, err := client.sdkClient.GetRepo(repoOwner, repoName)
if err != nil {
return "", err
}
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("unexpected status code '%d'", resp.StatusCode)
}
branch := repo.DefaultBranch
if err := client.responseCache.Set(cacheKey, []byte(branch), defaultBranchCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
return branch, nil
}
func (client *Client) GiteaCheckIfOwnerExists(owner string) (bool, error) {
cacheKey := fmt.Sprintf("%s/%s", ownerExistenceKeyPrefix, owner)
if exist, ok := client.responseCache.Get(cacheKey); ok && exist != nil {
return string(exist) == "true", nil
}
_, resp, err := client.sdkClient.GetUserInfo(owner)
if resp.StatusCode == http.StatusOK && err == nil {
if err := client.responseCache.Set(cacheKey, []byte("true"), ownerExistenceCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
return true, nil
} else if resp.StatusCode != http.StatusNotFound {
return false, err
}
_, resp, err = client.sdkClient.GetOrg(owner)
if resp.StatusCode == http.StatusOK && err == nil {
if err := client.responseCache.Set(cacheKey, []byte("true"), ownerExistenceCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
return true, nil
} else if resp.StatusCode != http.StatusNotFound {
return false, err
}
if err := client.responseCache.Set(cacheKey, []byte("false"), ownerExistenceCacheTimeout); err != nil {
log.Error().Err(err).Msg("[cache] error on cache write")
}
return false, nil
}
func (client *Client) getMimeTypeByExtension(resource string) string {
mimeType := mime.TypeByExtension(path.Ext(resource))
mimeTypeSplit := strings.SplitN(mimeType, ";", 2)
if client.forbiddenMimeTypes[mimeTypeSplit[0]] || mimeType == "" {
mimeType = client.defaultMimeType
}
log.Trace().Msgf("probe mime of %q is %q", resource, mimeType)
return mimeType
}