Skip to content

Commit

Permalink
nfd-topology-updater: serve metrics and healthz on the same port
Browse files Browse the repository at this point in the history
Changes the gRPC health endpoint to plain http. At the same time starts
serving both the metrics and healthz endpoints on a single port.
Replaces the -metrics and -grpc-health command line flags with a single
-port flag.

Changes the Helm and kustomize deployments correspondingly.
  • Loading branch information
marquiz committed Nov 4, 2024
1 parent 65b5e0c commit 76337ed
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 78 deletions.
6 changes: 2 additions & 4 deletions cmd/nfd-topology-updater/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,8 @@ func initFlags(flagset *flag.FlagSet) (*topology.Args, *resourcemonitor.Args) {
"Do not create or update NodeResourceTopology objects.")
flagset.StringVar(&args.KubeConfigFile, "kubeconfig", "",
"Kube config file.")
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
"Port on which to expose metrics.")
flagset.IntVar(&args.GrpcHealthPort, "grpc-health", 8082,
"Port on which to expose the grpc health endpoint.")
flagset.IntVar(&args.Port, "port", 8080,
"Port on which to metrics and healthz endpoints are served")
flagset.DurationVar(&resourcemonitorArgs.SleepInterval, "sleep-interval", time.Duration(60)*time.Second,
"Time to sleep between CR updates. zero means no CR updates on interval basis. [Default: 60s]")
flagset.StringVar(&resourcemonitorArgs.Namespace, "watch-namespace", "*",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@ spec:
image: gcr.io/k8s-staging-nfd/node-feature-discovery:master
imagePullPolicy: Always
livenessProbe:
grpc:
port: 8082
httpGet:
path: /healthz
port: http
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
grpc:
port: 8082
httpGet:
path: /healthz
port: http
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 10
Expand All @@ -41,5 +43,5 @@ spec:
cpu: 50m
memory: 40Mi
ports:
- name: metrics
containerPort: 8081
- name: http
containerPort: 8080
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,9 @@ spec:
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: "{{ .Values.image.pullPolicy }}"
livenessProbe:
grpc:
port: {{ .Values.topologyUpdater.healthPort | default "8082" }}
httpGet:
path: /healthz
port: http
{{- with .Values.topologyUpdater.livenessProbe.initialDelaySeconds }}
initialDelaySeconds: {{ . }}
{{- end }}
Expand All @@ -60,8 +61,9 @@ spec:
timeoutSeconds: {{ . }}
{{- end }}
readinessProbe:
grpc:
port: {{ .Values.topologyUpdater.healthPort | default "8082" }}
httpGet:
path: /healthz
port: http
{{- with .Values.topologyUpdater.readinessProbe.initialDelaySeconds }}
initialDelaySeconds: {{ . }}
{{- end }}
Expand Down Expand Up @@ -113,16 +115,13 @@ spec:
# Disable kubelet state tracking by giving an empty path
- "-kubelet-state-dir="
{{- end }}
- "-metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}}"
- "-grpc-health={{ .Values.topologyUpdater.healthPort | default "8082" }}"
- "-port={{ .Values.topologyUpdater.port | default "8080"}}"
{{- with .Values.topologyUpdater.extraArgs }}
{{- toYaml . | nindent 10 }}
{{- end }}
ports:
- containerPort: {{ .Values.topologyUpdater.metricsPort | default "8081"}}
name: metrics
- containerPort: {{ .Values.topologyUpdater.healthPort | default "8082" }}
name: health
- containerPort: {{ .Values.topologyUpdater.port | default "8080"}}
name: http
volumeMounts:
{{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }}
- name: kubelet-config
Expand Down
9 changes: 2 additions & 7 deletions deployment/helm/node-feature-discovery/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -514,8 +514,7 @@ topologyUpdater:
rbac:
create: true

metricsPort: 8081
healthPort: 8082
port: 8080
kubeletConfigPath:
kubeletPodResourcesSockPath:
updateInterval: 60s
Expand All @@ -529,17 +528,13 @@ topologyUpdater:
drop: [ "ALL" ]
readOnlyRootFilesystem: true
runAsUser: 0

livenessProbe:
grpc:
port: 8082
initialDelaySeconds: 10
# failureThreshold: 3
# periodSeconds: 10
# timeoutSeconds: 1
readinessProbe:
grpc:
port: 8082
initialDelaySeconds: 5
failureThreshold: 10
# periodSeconds: 10
Expand Down
3 changes: 1 addition & 2 deletions docs/deployment/helm.md
Original file line number Diff line number Diff line change
Expand Up @@ -272,8 +272,7 @@ API's you need to install the prometheus operator in your cluster.
| `topologyUpdater.serviceAccount.annotations` | dict | {} | Annotations to add to the service account for topology updater |
| `topologyUpdater.serviceAccount.name` | string | | The name of the service account for topology updater to use. If not set and create is true, a name is generated using the fullname template and `-topology-updater` suffix |
| `topologyUpdater.rbac.create` | bool | true | Specifies whether to create [RBAC][rbac] configuration for topology updater |
| `topologyUpdater.metricsPort` | integer | 8081 | Port on which to expose prometheus metrics |
| `topologyUpdater.healthPort` | integer | 8082 | Port on which to expose the grpc health endpoint, will be also used for the probes |
| `topologyUpdater.port` | integer | 8080 | Port on which to serve http for metrics and healthz endpoints. |
| `topologyUpdater.kubeletConfigPath` | string | "" | Specifies the kubelet config host path |
| `topologyUpdater.kubeletPodResourcesSockPath` | string | "" | Specifies the kubelet sock path to read pod resources |
| `topologyUpdater.updateInterval` | string | 60s | Time to sleep between CR updates. Non-positive value implies no CR update. |
Expand Down
70 changes: 21 additions & 49 deletions pkg/nfd-topology-updater/nfd-topology-updater.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,13 @@ package nfdtopologyupdater

import (
"fmt"
"net"
"net/http"
"net/url"
"os"
"path/filepath"

"golang.org/x/net/context"

"google.golang.org/grpc"
"google.golang.org/grpc/health"
"google.golang.org/grpc/health/grpc_health_v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
Expand All @@ -37,6 +34,7 @@ import (

"github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2"
topologyclientset "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/generated/clientset/versioned"
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/node-feature-discovery/pkg/nfd-topology-updater/kubeletnotifier"
"sigs.k8s.io/node-feature-discovery/pkg/podres"
"sigs.k8s.io/node-feature-discovery/pkg/resourcemonitor"
Expand All @@ -56,13 +54,12 @@ const (

// Args are the command line arguments
type Args struct {
MetricsPort int
Port int
NoPublish bool
Oneshot bool
KubeConfigFile string
ConfigFile string
KubeletStateDir string
GrpcHealthPort int

Klog map[string]*utils.KlogFlagVal
}
Expand Down Expand Up @@ -90,7 +87,6 @@ type nfdTopologyUpdater struct {
ownerRefs []metav1.OwnerReference
k8sClient k8sclient.Interface
kubeletConfigFunc func() (*kubeletconfigv1beta1.KubeletConfiguration, error)
healthServer *grpc.Server
}

// NewTopologyUpdater creates a new NfdTopologyUpdater instance.
Expand Down Expand Up @@ -134,27 +130,8 @@ func (w *nfdTopologyUpdater) detectTopologyPolicyAndScope() (string, string, err
return klConfig.TopologyManagerPolicy, klConfig.TopologyManagerScope, nil
}

func (w *nfdTopologyUpdater) startGrpcHealthServer(errChan chan<- error) error {
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", w.args.GrpcHealthPort))
if err != nil {
return fmt.Errorf("failed to listen: %w", err)
}

s := grpc.NewServer()
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
klog.InfoS("gRPC health server serving", "port", w.args.GrpcHealthPort)

go func() {
defer func() {
lis.Close()
}()
if err := s.Serve(lis); err != nil {
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
}
klog.InfoS("gRPC health server stopped")
}()
w.healthServer = s
return nil
func (w *nfdTopologyUpdater) Healthz(writer http.ResponseWriter, _ *http.Request) {
writer.WriteHeader(http.StatusOK)
}

// Run nfdTopologyUpdater. Returns if a fatal error is encountered, or, after
Expand Down Expand Up @@ -187,15 +164,14 @@ func (w *nfdTopologyUpdater) Run() error {
return fmt.Errorf("faild to configure Node Feature Discovery Topology Updater: %w", err)
}

httpMux := http.NewServeMux()

// Register to metrics server
if w.args.MetricsPort > 0 {
m := utils.CreateMetricsServer(w.args.MetricsPort,
buildInfo,
scanErrors)
go m.Run()
registerVersion(version.Get())
defer m.Stop()
}
promRegistry := prometheus.NewRegistry()
promRegistry.MustRegister(
buildInfo,
scanErrors)
registerVersion(version.Get())

var resScan resourcemonitor.ResourcesScanner

Expand All @@ -215,20 +191,19 @@ func (w *nfdTopologyUpdater) Run() error {
return fmt.Errorf("failed to obtain node resource information: %w", err)
}

grpcErr := make(chan error)
// Register health probe (at this point we're "ready and live")
httpMux.HandleFunc("/healthz", w.Healthz)

// Start gRPC server for liveness probe (at this point we're "live")
if w.args.GrpcHealthPort != 0 {
if err := w.startGrpcHealthServer(grpcErr); err != nil {
return fmt.Errorf("failed to start gRPC health server: %w", err)
}
}
// Start HTTP server
httpServer := http.Server{Addr: fmt.Sprintf(":%d", w.args.Port), Handler: httpMux}
go func() {
klog.InfoS("http server starting", "port", httpServer.Addr)
klog.InfoS("http server stopped", "exitCode", httpServer.ListenAndServe())
}()
defer httpServer.Close()

for {
select {
case err := <-grpcErr:
return fmt.Errorf("error in serving gRPC: %w", err)

case info := <-w.eventSource:
klog.V(4).InfoS("event received, scanning...", "event", info.Event)
scanResponse, err := resScan.Scan()
Expand Down Expand Up @@ -257,9 +232,6 @@ func (w *nfdTopologyUpdater) Run() error {

case <-w.stop:
klog.InfoS("shutting down nfd-topology-updater")
if w.healthServer != nil {
w.healthServer.GracefulStop()
}
return nil
}
}
Expand Down

0 comments on commit 76337ed

Please sign in to comment.