Skip to content

[server] Add health check HTTP endpoint for Relay server #4297

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Aug 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 39 additions & 7 deletions relay/cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"net/http"
"os"
"os/signal"
"sync"
"syscall"
"time"

Expand All @@ -17,8 +18,9 @@ import (
"github.com/spf13/cobra"

"github.com/netbirdio/netbird/encryption"
"github.com/netbirdio/netbird/shared/relay/auth"
"github.com/netbirdio/netbird/relay/healthcheck"
"github.com/netbirdio/netbird/relay/server"
"github.com/netbirdio/netbird/shared/relay/auth"
"github.com/netbirdio/netbird/signal/metrics"
"github.com/netbirdio/netbird/util"
)
Expand All @@ -34,12 +36,13 @@ type Config struct {
LetsencryptDomains []string
// in case of using Route 53 for DNS challenge the credentials should be provided in the environment variables or
// in the AWS credentials file
LetsencryptAWSRoute53 bool
TlsCertFile string
TlsKeyFile string
AuthSecret string
LogLevel string
LogFile string
LetsencryptAWSRoute53 bool
TlsCertFile string
TlsKeyFile string
AuthSecret string
LogLevel string
LogFile string
HealthcheckListenAddress string
}

func (c Config) Validate() error {
Expand Down Expand Up @@ -87,6 +90,7 @@ func init() {
rootCmd.PersistentFlags().StringVarP(&cobraConfig.AuthSecret, "auth-secret", "s", "", "auth secret")
rootCmd.PersistentFlags().StringVar(&cobraConfig.LogLevel, "log-level", "info", "log level")
rootCmd.PersistentFlags().StringVar(&cobraConfig.LogFile, "log-file", "console", "log file")
rootCmd.PersistentFlags().StringVarP(&cobraConfig.HealthcheckListenAddress, "health-listen-address", "H", ":9000", "listen address of healthcheck server")

setFlagsFromEnvVars(rootCmd)
}
Expand All @@ -102,6 +106,7 @@ func waitForExitSignal() {
}

func execute(cmd *cobra.Command, args []string) error {
wg := sync.WaitGroup{}
err := cobraConfig.Validate()
if err != nil {
log.Debugf("invalid config: %s", err)
Expand All @@ -120,7 +125,9 @@ func execute(cmd *cobra.Command, args []string) error {
return fmt.Errorf("setup metrics: %v", err)
}

wg.Add(1)
go func() {
defer wg.Done()
log.Infof("running metrics server: %s%s", metricsServer.Addr, metricsServer.Endpoint)
if err := metricsServer.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
log.Fatalf("Failed to start metrics server: %v", err)
Expand Down Expand Up @@ -154,19 +161,42 @@ func execute(cmd *cobra.Command, args []string) error {
return fmt.Errorf("failed to create relay server: %v", err)
}
log.Infof("server will be available on: %s", srv.InstanceURL())
wg.Add(1)
go func() {
defer wg.Done()
if err := srv.Listen(srvListenerCfg); err != nil {
log.Fatalf("failed to bind server: %s", err)
}
}()

hCfg := healthcheck.Config{
ListenAddress: cobraConfig.HealthcheckListenAddress,
ServiceChecker: srv,
}
httpHealthcheck, err := healthcheck.NewServer(hCfg)
if err != nil {
log.Debugf("failed to create healthcheck server: %v", err)
return fmt.Errorf("failed to create healthcheck server: %v", err)
}
wg.Add(1)
go func() {
defer wg.Done()
if err := httpHealthcheck.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
log.Fatalf("Failed to start healthcheck server: %v", err)
}
}()

// it will block until exit signal
waitForExitSignal()

ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()

var shutDownErrors error
if err := httpHealthcheck.Shutdown(ctx); err != nil {
shutDownErrors = multierror.Append(shutDownErrors, fmt.Errorf("failed to close healthcheck server: %v", err))
}

if err := srv.Shutdown(ctx); err != nil {
shutDownErrors = multierror.Append(shutDownErrors, fmt.Errorf("failed to close server: %s", err))
}
Expand All @@ -175,6 +205,8 @@ func execute(cmd *cobra.Command, args []string) error {
if err := metricsServer.Shutdown(ctx); err != nil {
shutDownErrors = multierror.Append(shutDownErrors, fmt.Errorf("failed to close metrics server: %v", err))
}

wg.Wait()
return shutDownErrors
}

Expand Down
195 changes: 195 additions & 0 deletions relay/healthcheck/healthcheck.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
package healthcheck

import (
"context"
"encoding/json"
"errors"
"net"
"net/http"
"sync"
"time"

log "github.com/sirupsen/logrus"

"github.com/netbirdio/netbird/relay/protocol"
"github.com/netbirdio/netbird/relay/server/listener/quic"
"github.com/netbirdio/netbird/relay/server/listener/ws"
)

// Constants used by the healthcheck HTTP endpoint.
const (
// statusHealthy and statusUnhealthy are the two values written to
// HealthStatus.Status.
statusHealthy = "healthy"
statusUnhealthy = "unhealthy"

// path is the URL path the healthcheck handler is registered on.
path = "/health"

// cacheTTL is how long a computed health status is served from the cache
// before the checks are run again.
cacheTTL = 3 * time.Second // Cache TTL for health status
)

// ServiceChecker is the view of the checked service that the healthcheck
// server needs in order to probe it.
type ServiceChecker interface {
// ListenerProtocols returns the protocols to probe. An empty result is
// reported as unhealthy.
ListenerProtocols() []protocol.Protocol
// ListenAddress returns the address the service listens on. The dial
// target is derived from it, and an empty value fails the check.
ListenAddress() string
}

// HealthStatus is the JSON document written by the healthcheck endpoint.
type HealthStatus struct {
// Status is either "healthy" or "unhealthy".
Status string `json:"status"`
// Timestamp is taken when the status is computed; it is also used to
// decide when the cached status has expired.
Timestamp time.Time `json:"timestamp"`
// Listeners holds the protocols reported by the ServiceChecker; it is nil
// when none are reported.
Listeners []protocol.Protocol `json:"listeners"`
// CertificateValid is false when the listen address is empty, a listener
// cannot be dialed, or a protocol is not known to the healthcheck.
CertificateValid bool `json:"certificate_valid"`
}

// Config holds the settings of the healthcheck Server.
type Config struct {
// ListenAddress is the address the healthcheck HTTP server binds to.
ListenAddress string
// ServiceChecker is the service being checked. NewServer rejects a nil
// value.
ServiceChecker ServiceChecker
}

// Server exposes the health status of the checked service over HTTP.
type Server struct {
config Config
httpServer *http.Server

// cacheMu guards cacheStatus.
cacheMu sync.Mutex
// cacheStatus is the most recently computed status, or nil before the
// first check has run.
cacheStatus *HealthStatus
}

// NewServer builds a healthcheck Server bound to config.ListenAddress and
// registers the health handler on the "/health" path. It returns an error when
// config.ServiceChecker is nil. The returned server does not listen until
// ListenAndServe is called.
func NewServer(config Config) (*Server, error) {
	// Reject an unusable configuration before allocating anything.
	if config.ServiceChecker == nil {
		return nil, errors.New("service checker is required")
	}

	router := http.NewServeMux()

	srv := &Server{config: config}
	srv.httpServer = &http.Server{
		Addr:         config.ListenAddress,
		Handler:      router,
		ReadTimeout:  5 * time.Second,
		WriteTimeout: 10 * time.Second,
		IdleTimeout:  15 * time.Second,
	}

	router.HandleFunc(path, srv.handleHealthcheck)
	return srv, nil
}

// ListenAndServe logs the healthcheck URL and then serves HTTP on the
// configured listen address. It returns whatever the underlying http.Server
// returns.
func (s *Server) ListenAndServe() error {
	endpoint := dialAddress(s.config.ListenAddress)
	log.Infof("starting healthcheck server on: http://%s%s", endpoint, path)

	return s.httpServer.ListenAndServe()
}

// Shutdown stops the healthcheck HTTP server by delegating to
// http.Server.Shutdown with ctx, and returns that call's error.
func (s *Server) Shutdown(ctx context.Context) error {
	log.Info("Shutting down healthcheck server")

	return s.httpServer.Shutdown(ctx)
}

// handleHealthcheck serves the health status as JSON. A status computed within
// the last cacheTTL is reused; otherwise the checks are run again with a
// five-second budget and the result replaces the cached one. The response code
// is 200 when healthy and 503 otherwise.
func (s *Server) handleHealthcheck(w http.ResponseWriter, _ *http.Request) {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	// Snapshot the cached pointer under the lock; the probes run unlocked.
	s.cacheMu.Lock()
	cached := s.cacheStatus
	s.cacheMu.Unlock()

	var (
		result  *HealthStatus
		healthy bool
	)
	if cached == nil || time.Since(cached.Timestamp) > cacheTTL {
		// Nothing cached yet, or the cached entry has expired: probe again.
		result, healthy = s.getHealthStatus(ctx)

		s.cacheMu.Lock()
		s.cacheStatus = result
		s.cacheMu.Unlock()
	} else {
		result = cached
		healthy = cached.Status == statusHealthy
	}

	code := http.StatusServiceUnavailable
	if healthy {
		code = http.StatusOK
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(code)

	if err := json.NewEncoder(w).Encode(result); err != nil {
		log.Errorf("Failed to encode healthcheck response: %v", err)
	}
}

// getHealthStatus runs the listener check followed by the certificate check
// and returns the resulting report together with a flag that is true only when
// both checks passed. The report's timestamp is taken before the checks run.
func (s *Server) getHealthStatus(ctx context.Context) (*HealthStatus, bool) {
	report := &HealthStatus{
		Timestamp:        time.Now(),
		Status:           statusHealthy,
		CertificateValid: true,
	}

	protos, listening := s.validateListeners()
	report.Listeners = protos

	// The certificate check runs even when no listeners were reported.
	certOK := s.validateCertificate(ctx)
	report.CertificateValid = certOK

	healthy := listening && certOK
	if !healthy {
		report.Status = statusUnhealthy
	}

	return report, healthy
}

// validateListeners asks the service checker for its listener protocols. It
// returns them with true when at least one is reported, and nil with false
// otherwise.
func (s *Server) validateListeners() ([]protocol.Protocol, bool) {
	protos := s.config.ServiceChecker.ListenerProtocols()
	if len(protos) > 0 {
		return protos, true
	}
	return nil, false
}

// validateCertificate reports whether every listener protocol of the checked
// service can be dialed at the service's listen address. It returns false as
// soon as the listen address is empty, a dial fails, or a protocol is not one
// the healthcheck knows how to dial.
func (s *Server) validateCertificate(ctx context.Context) bool {
	checker := s.config.ServiceChecker

	listenAddress := checker.ListenAddress()
	if listenAddress == "" {
		log.Warn("listen address is empty")
		return false
	}

	target := dialAddress(listenAddress)

	for _, proto := range checker.ListenerProtocols() {
		// Each case either continues to the next protocol on success or logs
		// the reason and falls out of the switch to the shared failure return.
		switch proto {
		case ws.Proto:
			err := dialWS(ctx, target)
			if err == nil {
				continue
			}
			log.Errorf("failed to dial WebSocket listener: %v", err)
		case quic.Proto:
			err := dialQUIC(ctx, target)
			if err == nil {
				continue
			}
			log.Errorf("failed to dial QUIC listener: %v", err)
		default:
			log.Warnf("unknown protocol for healthcheck: %s", proto)
		}
		return false
	}

	return true
}

// dialAddress turns a listen address into the address used for dialing and
// logging. An empty host, "::" and "0.0.0.0" are all rendered as "0.0.0.0";
// any other host is kept. Input that is not in host:port form is returned
// unchanged and may not be dialable.
func dialAddress(listenAddress string) string {
	host, port, err := net.SplitHostPort(listenAddress)
	if err != nil {
		return listenAddress
	}

	switch host {
	case "", "::", "0.0.0.0":
		return net.JoinHostPort("0.0.0.0", port)
	default:
		return net.JoinHostPort(host, port)
	}
}
31 changes: 31 additions & 0 deletions relay/healthcheck/quic.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package healthcheck

import (
"context"
"crypto/tls"
"fmt"
"time"

"github.com/quic-go/quic-go"

tlsnb "github.com/netbirdio/netbird/shared/relay/tls"
)

// dialQUIC opens a QUIC connection to address using the NetBird ALPN and
// closes it again straight away. It returns a wrapped error when the
// connection cannot be established and nil otherwise.
func dialQUIC(ctx context.Context, address string) error {
	clientTLS := &tls.Config{
		// Certificate validation stays enabled for the probe.
		InsecureSkipVerify: false,
		NextProtos:         []string{tlsnb.NBalpn},
	}
	quicCfg := &quic.Config{
		MaxIdleTimeout:  30 * time.Second,
		KeepAlivePeriod: 10 * time.Second,
		EnableDatagrams: true,
	}

	session, err := quic.DialAddr(ctx, address, clientTLS, quicCfg)
	if err != nil {
		return fmt.Errorf("failed to connect to QUIC server: %w", err)
	}

	// The probe only needs the connection to be established; the close error
	// is deliberately ignored.
	_ = session.CloseWithError(0, "availability check complete")
	return nil
}
28 changes: 28 additions & 0 deletions relay/healthcheck/ws.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package healthcheck

import (
"context"
"fmt"

"github.com/coder/websocket"

"github.com/netbirdio/netbird/shared/relay"
)

func dialWS(ctx context.Context, address string) error {
url := fmt.Sprintf("wss://%s%s", address, relay.WebSocketURLPath)

conn, resp, err := websocket.Dial(ctx, url, nil)
if resp != nil {
defer func() {
_ = resp.Body.Close()
}()

Copy link
Preview

Copilot AI Aug 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The empty line after the anonymous function declaration creates unnecessary whitespace. Remove the blank line for cleaner code formatting.

Suggested change

Copilot uses AI. Check for mistakes.

}
if err != nil {
return fmt.Errorf("failed to connect to websocket: %w", err)
}

_ = conn.Close(websocket.StatusNormalClosure, "availability check complete")
return nil
}
3 changes: 3 additions & 0 deletions relay/protocol/protocol.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
package protocol

// Protocol identifies the protocol served by a relay listener.
type Protocol string
3 changes: 3 additions & 0 deletions relay/server/listener/listener.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@ package listener
import (
"context"
"net"

"github.com/netbirdio/netbird/relay/protocol"
)

// Listener is the interface for the relay server's listeners.
type Listener interface {
// NOTE(review): presumably the callback is invoked for each accepted
// connection — confirm against the implementations.
Listen(func(conn net.Conn)) error
Shutdown(ctx context.Context) error
// Protocol returns the protocol this listener serves.
Protocol() protocol.Protocol
}
Loading
Loading