pages-server/server/certificates/certificates.go

409 lines
14 KiB
Go
Raw Normal View History

2021-12-05 14:21:05 +00:00
package certificates
2021-03-16 23:34:31 +00:00
import (
2021-12-05 17:26:54 +00:00
"context"
2021-03-16 23:34:31 +00:00
"crypto/tls"
"crypto/x509"
"errors"
2021-12-05 22:20:34 +00:00
"fmt"
"strconv"
"strings"
"time"
2024-04-18 17:08:47 +00:00
"github.com/hashicorp/golang-lru/v2"
"github.com/go-acme/lego/v4/certcrypto"
2021-12-03 02:44:21 +00:00
"github.com/go-acme/lego/v4/certificate"
"github.com/go-acme/lego/v4/challenge/tlsalpn01"
"github.com/go-acme/lego/v4/lego"
2021-12-05 15:33:56 +00:00
"github.com/reugn/equalizer"
"github.com/rs/zerolog/log"
2021-12-03 02:44:21 +00:00
2021-12-03 03:15:48 +00:00
"codeberg.org/codeberg/pages/server/cache"
2021-12-03 02:44:21 +00:00
"codeberg.org/codeberg/pages/server/database"
2021-12-05 14:21:05 +00:00
dnsutils "codeberg.org/codeberg/pages/server/dns"
"codeberg.org/codeberg/pages/server/gitea"
2021-12-05 14:21:05 +00:00
"codeberg.org/codeberg/pages/server/upstream"
2021-03-16 23:34:31 +00:00
)
// ErrUserRateLimitExceeded is the sentinel error wrapped by checkUserLimit
// when a user's token bucket refuses another certificate request.
var ErrUserRateLimitExceeded = errors.New("rate limit exceeded: 10 certificates per user per 24 hours")
// keyCache is the package-wide in-memory LRU cache of certificates, keyed by
// the domain they are served for. It is created with room for 4096 entries by
// the code in TLSConfig.
var keyCache *lru.Cache[string, tls.Certificate]
2021-12-03 03:15:48 +00:00
// TLSConfig returns the configuration for generating, serving and cleaning up Let's Encrypt certificates.
func TLSConfig(mainDomainSuffix string,
giteaClient *gitea.Client,
acmeClient *AcmeClient,
firstDefaultBranch string,
2024-04-18 17:08:47 +00:00
challengeCache, canonicalDomainCache cache.ICache,
certDB database.CertDB,
noDNS01 bool,
rawDomain string,
) *tls.Config {
2021-12-05 13:45:17 +00:00
return &tls.Config{
// check DNS name & get certificate from Let's Encrypt
GetCertificate: func(info *tls.ClientHelloInfo) (*tls.Certificate, error) {
domain := strings.ToLower(strings.TrimSpace(info.ServerName))
if len(domain) < 1 {
return nil, errors.New("missing domain info via SNI (RFC 4366, Section 3.1)")
2021-12-05 13:45:17 +00:00
}
// https request init is actually a acme challenge
2021-12-05 13:45:17 +00:00
if info.SupportedProtos != nil {
for _, proto := range info.SupportedProtos {
if proto != tlsalpn01.ACMETLS1Protocol {
continue
}
log.Info().Msgf("Detect ACME-TLS1 challenge for '%s'", domain)
challenge, ok := challengeCache.Get(domain)
if !ok {
return nil, errors.New("no challenge for this domain")
}
cert, err := tlsalpn01.ChallengeCert(domain, string(challenge))
if err != nil {
return nil, err
}
return cert, nil
}
}
2021-12-05 13:45:17 +00:00
targetOwner := ""
Allow to use certificate even if domain validation fails (#160) - Currently if the canonical domain validations fails(either for legitimate reasons or for bug reasons like the request to Gitea/Forgejo failing) it will use main domain certificate, which in the case for custom domains will warrant a security error as the certificate isn't issued to the custom domain. - This patch handles this situation more gracefully and instead only disallow obtaining a certificate if the domain validation fails, so in the case that a certificate still exists it can still be used even if the canonical domain validation fails. There's a small side effect, legitimate users that remove domains from `.domain` will still be able to use the removed domain(as long as the DNS records exists) as long as the certificate currently hold by pages-server isn't expired. - Given the increased usage in custom domains that are resulting in errors, I think it ways more than the side effect. - In order to future-proof against future slowdowns of instances, add a retry mechanism to the domain validation function, such that it's more likely to succeed even if the instance is not responding. - Refactor the code a bit and add some comments. Co-authored-by: Gusted <postmaster@gusted.xyz> Co-authored-by: 6543 <6543@obermui.de> Reviewed-on: https://codeberg.org/Codeberg/pages-server/pulls/160 Reviewed-by: 6543 <6543@obermui.de> Co-authored-by: Gusted <gusted@noreply.codeberg.org> Co-committed-by: Gusted <gusted@noreply.codeberg.org>
2023-02-10 01:38:15 +00:00
mayObtainCert := true
if strings.HasSuffix(domain, mainDomainSuffix) || strings.EqualFold(domain, mainDomainSuffix[1:]) {
if noDNS01 {
// Limit the domains allowed to request a certificate to pages-server domains
// and domains for an existing user of org
if !strings.EqualFold(domain, mainDomainSuffix[1:]) && !strings.EqualFold(domain, rawDomain) {
targetOwner := strings.TrimSuffix(domain, mainDomainSuffix)
owner_exist, err := giteaClient.GiteaCheckIfOwnerExists(targetOwner)
mayObtainCert = owner_exist
if err != nil {
log.Error().Err(err).Msgf("Failed to check '%s' existence on the forge: %s", targetOwner, err)
mayObtainCert = false
}
}
} else {
// deliver default certificate for the main domain (*.codeberg.page)
domain = mainDomainSuffix
}
} else {
2021-12-05 13:45:17 +00:00
var targetRepo, targetBranch string
targetOwner, targetRepo, targetBranch = dnsutils.GetTargetFromDNS(domain, mainDomainSuffix, firstDefaultBranch)
2021-12-05 13:45:17 +00:00
if targetOwner == "" {
// DNS not set up, return main certificate to redirect to the docs
domain = mainDomainSuffix
2021-12-05 13:45:17 +00:00
} else {
targetOpt := &upstream.Options{
TargetOwner: targetOwner,
TargetRepo: targetRepo,
TargetBranch: targetBranch,
}
_, valid := targetOpt.CheckCanonicalDomain(giteaClient, domain, mainDomainSuffix, canonicalDomainCache)
2021-12-05 13:45:17 +00:00
if !valid {
Allow to use certificate even if domain validation fails (#160) - Currently if the canonical domain validations fails(either for legitimate reasons or for bug reasons like the request to Gitea/Forgejo failing) it will use main domain certificate, which in the case for custom domains will warrant a security error as the certificate isn't issued to the custom domain. - This patch handles this situation more gracefully and instead only disallow obtaining a certificate if the domain validation fails, so in the case that a certificate still exists it can still be used even if the canonical domain validation fails. There's a small side effect, legitimate users that remove domains from `.domain` will still be able to use the removed domain(as long as the DNS records exists) as long as the certificate currently hold by pages-server isn't expired. - Given the increased usage in custom domains that are resulting in errors, I think it ways more than the side effect. - In order to future-proof against future slowdowns of instances, add a retry mechanism to the domain validation function, such that it's more likely to succeed even if the instance is not responding. - Refactor the code a bit and add some comments. Co-authored-by: Gusted <postmaster@gusted.xyz> Co-authored-by: 6543 <6543@obermui.de> Reviewed-on: https://codeberg.org/Codeberg/pages-server/pulls/160 Reviewed-by: 6543 <6543@obermui.de> Co-authored-by: Gusted <gusted@noreply.codeberg.org> Co-committed-by: Gusted <gusted@noreply.codeberg.org>
2023-02-10 01:38:15 +00:00
// We shouldn't obtain a certificate when we cannot check if the
// repository has specified this domain in the `.domains` file.
mayObtainCert = false
2021-12-05 13:45:17 +00:00
}
}
}
if keyCache == nil {
var err error
keyCache, err = lru.New[string, tls.Certificate](4096)
if err != nil {
panic(err) // This should only happen if 4096 < 0 at the time of writing, which should be reason enough to panic.
}
}
if tlsCertificate, ok := keyCache.Get(domain); ok {
2021-12-05 13:45:17 +00:00
// we can use an existing certificate object
return &tlsCertificate, nil
2021-12-05 13:45:17 +00:00
}
var tlsCertificate *tls.Certificate
2021-12-05 13:45:17 +00:00
var err error
if tlsCertificate, err = acmeClient.retrieveCertFromDB(domain, mainDomainSuffix, false, certDB); err != nil {
if !errors.Is(err, database.ErrNotFound) {
return nil, err
2021-12-05 13:45:17 +00:00
}
// we could not find a cert in db, request a new certificate
// first check if we are allowed to obtain a cert for this domain
if strings.EqualFold(domain, mainDomainSuffix) {
return nil, errors.New("won't request certificate for main domain, something really bad has happened")
}
Allow to use certificate even if domain validation fails (#160) - Currently if the canonical domain validations fails(either for legitimate reasons or for bug reasons like the request to Gitea/Forgejo failing) it will use main domain certificate, which in the case for custom domains will warrant a security error as the certificate isn't issued to the custom domain. - This patch handles this situation more gracefully and instead only disallow obtaining a certificate if the domain validation fails, so in the case that a certificate still exists it can still be used even if the canonical domain validation fails. There's a small side effect, legitimate users that remove domains from `.domain` will still be able to use the removed domain(as long as the DNS records exists) as long as the certificate currently hold by pages-server isn't expired. - Given the increased usage in custom domains that are resulting in errors, I think it ways more than the side effect. - In order to future-proof against future slowdowns of instances, add a retry mechanism to the domain validation function, such that it's more likely to succeed even if the instance is not responding. - Refactor the code a bit and add some comments. Co-authored-by: Gusted <postmaster@gusted.xyz> Co-authored-by: 6543 <6543@obermui.de> Reviewed-on: https://codeberg.org/Codeberg/pages-server/pulls/160 Reviewed-by: 6543 <6543@obermui.de> Co-authored-by: Gusted <gusted@noreply.codeberg.org> Co-committed-by: Gusted <gusted@noreply.codeberg.org>
2023-02-10 01:38:15 +00:00
if !mayObtainCert {
return nil, fmt.Errorf("won't request certificate for %q", domain)
Allow to use certificate even if domain validation fails (#160) - Currently if the canonical domain validations fails(either for legitimate reasons or for bug reasons like the request to Gitea/Forgejo failing) it will use main domain certificate, which in the case for custom domains will warrant a security error as the certificate isn't issued to the custom domain. - This patch handles this situation more gracefully and instead only disallow obtaining a certificate if the domain validation fails, so in the case that a certificate still exists it can still be used even if the canonical domain validation fails. There's a small side effect, legitimate users that remove domains from `.domain` will still be able to use the removed domain(as long as the DNS records exists) as long as the certificate currently hold by pages-server isn't expired. - Given the increased usage in custom domains that are resulting in errors, I think it ways more than the side effect. - In order to future-proof against future slowdowns of instances, add a retry mechanism to the domain validation function, such that it's more likely to succeed even if the instance is not responding. - Refactor the code a bit and add some comments. Co-authored-by: Gusted <postmaster@gusted.xyz> Co-authored-by: 6543 <6543@obermui.de> Reviewed-on: https://codeberg.org/Codeberg/pages-server/pulls/160 Reviewed-by: 6543 <6543@obermui.de> Co-authored-by: Gusted <gusted@noreply.codeberg.org> Co-committed-by: Gusted <gusted@noreply.codeberg.org>
2023-02-10 01:38:15 +00:00
}
tlsCertificate, err = acmeClient.obtainCert(acmeClient.legoClient, []string{domain}, nil, targetOwner, false, mainDomainSuffix, certDB)
2021-12-05 13:45:17 +00:00
if err != nil {
return nil, err
}
}
keyCache.Add(domain, *tlsCertificate)
return tlsCertificate, nil
2021-12-05 13:45:17 +00:00
},
NextProtos: []string{
"h2",
2021-12-05 13:45:17 +00:00
"http/1.1",
tlsalpn01.ACMETLS1Protocol,
},
2021-12-05 13:45:17 +00:00
// generated 2021-07-13, Mozilla Guideline v5.6, Go 1.14.4, intermediate configuration
// https://ssl-config.mozilla.org/#server=go&version=1.14.4&config=intermediate&guideline=5.6
MinVersion: tls.VersionTLS12,
CipherSuites: []uint16{
tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
tls.TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,
tls.TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,
tls.TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,
tls.TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,
},
}
}
func (c *AcmeClient) checkUserLimit(user string) error {
userLimit, ok := c.acmeClientCertificateLimitPerUser[user]
if !ok {
// Each user can only add 10 new domains per day.
2021-11-25 15:12:28 +00:00
userLimit = equalizer.NewTokenBucket(10, time.Hour*24)
c.acmeClientCertificateLimitPerUser[user] = userLimit
}
if !userLimit.Ask() {
return fmt.Errorf("user '%s' error: %w", user, ErrUserRateLimitExceeded)
}
return nil
}
func (c *AcmeClient) retrieveCertFromDB(sni, mainDomainSuffix string, useDnsProvider bool, certDB database.CertDB) (*tls.Certificate, error) {
// parse certificate from database
res, err := certDB.Get(sni)
2021-12-05 18:00:57 +00:00
if err != nil {
return nil, err
} else if res == nil {
return nil, database.ErrNotFound
}
tlsCertificate, err := tls.X509KeyPair(res.Certificate, res.PrivateKey)
if err != nil {
return nil, err
}
2021-12-02 18:12:45 +00:00
// TODO: document & put into own function
if !strings.EqualFold(sni, mainDomainSuffix) {
tlsCertificate.Leaf, err = x509.ParseCertificate(tlsCertificate.Certificate[0])
if err != nil {
return nil, fmt.Errorf("error parsing leaf tlsCert: %w", err)
}
// renew certificates 7 days before they expire
if tlsCertificate.Leaf.NotAfter.Before(time.Now().Add(7 * 24 * time.Hour)) {
// TODO: use ValidTill of custom cert struct
if res.CSR != nil && len(res.CSR) > 0 {
// CSR stores the time when the renewal shall be tried again
nextTryUnix, err := strconv.ParseInt(string(res.CSR), 10, 64)
if err == nil && time.Now().Before(time.Unix(nextTryUnix, 0)) {
return &tlsCertificate, nil
}
}
// TODO: make a queue ?
go (func() {
res.CSR = nil // acme client doesn't like CSR to be set
if _, err := c.obtainCert(c.legoClient, []string{sni}, res, "", useDnsProvider, mainDomainSuffix, certDB); err != nil {
log.Error().Msgf("Couldn't renew certificate for %s: %v", sni, err)
}
})()
}
}
return &tlsCertificate, nil
}
func (c *AcmeClient) obtainCert(acmeClient *lego.Client, domains []string, renew *certificate.Resource, user string, useDnsProvider bool, mainDomainSuffix string, keyDatabase database.CertDB) (*tls.Certificate, error) {
name := strings.TrimPrefix(domains[0], "*")
// lock to avoid simultaneous requests
_, working := c.obtainLocks.LoadOrStore(name, struct{}{})
if working {
for working {
time.Sleep(100 * time.Millisecond)
_, working = c.obtainLocks.Load(name)
}
cert, err := c.retrieveCertFromDB(name, mainDomainSuffix, useDnsProvider, keyDatabase)
if err != nil {
return nil, fmt.Errorf("certificate failed in synchronous request: %w", err)
}
return cert, nil
}
defer c.obtainLocks.Delete(name)
if acmeClient == nil {
if useDnsProvider {
return mockCert(domains[0], "DNS ACME client is not defined", mainDomainSuffix, keyDatabase)
} else {
return mockCert(domains[0], "ACME client uninitialized. This is a server error, please report!", mainDomainSuffix, keyDatabase)
}
}
// request actual cert
var res *certificate.Resource
var err error
if renew != nil && renew.CertURL != "" {
if c.acmeUseRateLimits {
c.acmeClientRequestLimit.Take()
}
log.Debug().Msgf("Renewing certificate for: %v", domains)
res, err = acmeClient.Certificate.Renew(*renew, true, false, "")
if err != nil {
log.Error().Err(err).Msgf("Couldn't renew certificate for %v, trying to request a new one", domains)
if c.acmeUseRateLimits {
c.acmeClientFailLimit.Take()
}
res = nil
}
}
if res == nil {
if user != "" {
if err := c.checkUserLimit(user); err != nil {
return nil, err
}
}
if c.acmeUseRateLimits {
c.acmeClientOrderLimit.Take()
c.acmeClientRequestLimit.Take()
}
log.Debug().Msgf("Re-requesting new certificate for %v", domains)
res, err = acmeClient.Certificate.Obtain(certificate.ObtainRequest{
Domains: domains,
Bundle: true,
MustStaple: false,
})
if c.acmeUseRateLimits && err != nil {
c.acmeClientFailLimit.Take()
}
}
if err != nil {
log.Error().Err(err).Msgf("Couldn't obtain again a certificate or %v", domains)
if renew != nil && renew.CertURL != "" {
tlsCertificate, err := tls.X509KeyPair(renew.Certificate, renew.PrivateKey)
if err != nil {
mockC, err2 := mockCert(domains[0], err.Error(), mainDomainSuffix, keyDatabase)
if err2 != nil {
return nil, errors.Join(err, err2)
}
return mockC, err
}
leaf, err := leaf(&tlsCertificate)
if err == nil && leaf.NotAfter.After(time.Now()) {
// avoid sending a mock cert instead of a still valid cert, instead abuse CSR field to store time to try again at
2021-12-01 21:59:52 +00:00
renew.CSR = []byte(strconv.FormatInt(time.Now().Add(6*time.Hour).Unix(), 10))
2021-12-05 18:00:57 +00:00
if err := keyDatabase.Put(name, renew); err != nil {
mockC, err2 := mockCert(domains[0], err.Error(), mainDomainSuffix, keyDatabase)
if err2 != nil {
return nil, errors.Join(err, err2)
}
return mockC, err
2021-12-05 18:00:57 +00:00
}
return &tlsCertificate, nil
}
}
return mockCert(domains[0], err.Error(), mainDomainSuffix, keyDatabase)
}
log.Debug().Msgf("Obtained certificate for %v", domains)
2021-12-05 18:00:57 +00:00
if err := keyDatabase.Put(name, res); err != nil {
return nil, err
2021-12-05 18:00:57 +00:00
}
tlsCertificate, err := tls.X509KeyPair(res.Certificate, res.PrivateKey)
if err != nil {
return nil, err
}
return &tlsCertificate, nil
}
func SetupMainDomainCertificates(mainDomainSuffix string, acmeClient *AcmeClient, certDB database.CertDB) error {
2021-12-05 22:20:34 +00:00
// getting main cert before ACME account so that we can fail here without hitting rate limits
mainCertBytes, err := certDB.Get(mainDomainSuffix)
if err != nil && !errors.Is(err, database.ErrNotFound) {
return fmt.Errorf("cert database is not working: %w", err)
}
if mainCertBytes == nil {
_, err = acmeClient.obtainCert(acmeClient.dnsChallengerLegoClient, []string{"*" + mainDomainSuffix, mainDomainSuffix[1:]}, nil, "", true, mainDomainSuffix, certDB)
if err != nil {
log.Error().Err(err).Msg("Couldn't renew main domain certificate, continuing with mock certs only")
}
}
2021-12-05 22:20:34 +00:00
return nil
}
func MaintainCertDB(ctx context.Context, interval time.Duration, acmeClient *AcmeClient, mainDomainSuffix string, certDB database.CertDB) {
for {
// delete expired certs that will be invalid until next clean up
threshold := time.Now().Add(interval)
expiredCertCount := 0
certs, err := certDB.Items(0, 0)
if err != nil {
log.Error().Err(err).Msg("could not get certs from list")
} else {
for _, cert := range certs {
if !strings.EqualFold(cert.Domain, strings.TrimPrefix(mainDomainSuffix, ".")) {
if time.Unix(cert.ValidTill, 0).Before(threshold) {
err := certDB.Delete(cert.Domain)
if err != nil {
log.Error().Err(err).Msgf("Deleting expired certificate for %q failed", cert.Domain)
} else {
expiredCertCount++
}
}
}
}
log.Debug().Msgf("Removed %d expired certificates from the database", expiredCertCount)
}
// update main cert
res, err := certDB.Get(mainDomainSuffix)
2021-12-05 18:00:57 +00:00
if err != nil {
log.Error().Msgf("Couldn't get cert for domain %q", mainDomainSuffix)
2021-12-05 18:00:57 +00:00
} else if res == nil {
log.Error().Msgf("Couldn't renew certificate for main domain %q expected main domain cert to exist, but it's missing - seems like the database is corrupted", mainDomainSuffix)
} else {
tlsCertificates, err := certcrypto.ParsePEMBundle(res.Certificate)
if err != nil {
log.Error().Err(fmt.Errorf("could not parse cert for mainDomainSuffix: %w", err))
} else if tlsCertificates[0].NotAfter.Before(time.Now().Add(30 * 24 * time.Hour)) {
// renew main certificate 30 days before it expires
go (func() {
_, err = acmeClient.obtainCert(acmeClient.dnsChallengerLegoClient, []string{"*" + mainDomainSuffix, mainDomainSuffix[1:]}, res, "", true, mainDomainSuffix, certDB)
if err != nil {
log.Error().Err(err).Msg("Couldn't renew certificate for main domain")
}
})()
}
}
2021-12-05 17:26:54 +00:00
select {
case <-ctx.Done():
return
case <-time.After(interval):
}
}
}
// leaf returns the parsed leaf certificate, either from c.Leaf or by parsing
// the corresponding c.Certificate[0].
//
// An error is returned when c.Leaf is unset and the certificate chain is
// empty, or when the first chain entry cannot be parsed.
func leaf(c *tls.Certificate) (*x509.Certificate, error) {
	if c.Leaf != nil {
		return c.Leaf, nil
	}
	// Guard the index below: an empty chain must not panic.
	if len(c.Certificate) == 0 {
		return nil, errors.New("tls certificate contains no certificate chain")
	}
	return x509.ParseCertificate(c.Certificate[0])
}