Skip to content

Commit 29c22af

Browse files
authored
inbound: Improve policy metrics (#1237)
We recently introduced metrics to help surface inbound policy decisions, but in practice these haven't been as useful as we might hope. Specifically, error metrics do not include the `target_addr` label so these metrics can't be correlated with servers, etc. This change improves error metrics and also introduces new metrics to describe authorization decisions: authorization denials shouldn't be classified as errors, really, anyway. This change also improves TCP forwarding authorization so that policy changes can be honored at runtime: previously authorized connections may be dropped if the policy is updated so that the connection is no longer authorized. The gateway is also updated to enforce HTTP policies at runtime as well so that policy changes can be honored after the connection has been established. This change introduces new metrics: * `inbound_http_authz_allow_total` * `inbound_http_authz_deny_total` * `inbound_tcp_authz_allow_total` * `inbound_tcp_authz_deny_total` * `inbound_tcp_authz_terminate_total` _allow_ metrics include `target_addr`, `srv_name`, and `saz_name` labels. _deny_ and _terminate_ metrics include only `target_addr` and `srv_name` labels. Authorization denials are no longer reflected in inbound_tcp_error or inbound_http_error metrics. A number of internal changes have been made to support this: * The `inbound::policy::authorize` module includes middlewares for TCP and HTTP authorization, replacing the prior method of enforcing policy in the stack/router. This module ensures that metrics are recorded for policy decisions. * The `error-metrics` crate has been removed. In its place a `monitor` type has been added to the `stack` crate, supporting a general way to observe errors, decoupled from the metrics registry. * Inbound and outbound error metrics are now tracked in the inbound and outbound crates, respectively. Inbound- and outbound-specific error types are also moved into their respective crates. 
* The `app_core::errors` module has been updated to only define the types it needs to instrument the error response layer. Error responses are now primarily instrumented via the `HttpError` type so that errors that should be handled can be configured where the error is thrown. The error type now holds an underlying source error so that the error metrics layer can see through this wrapper type to track the underlying error cause. * Server & Authorization labels are no longer handled as free-form maps. We currently read only the `name` label from each, and this label is required.
1 parent 6adffd2 commit 29c22af

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

66 files changed

+1940
-1204
lines changed

Cargo.lock

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -668,7 +668,6 @@ dependencies = [
668668
"linkerd-duplex",
669669
"linkerd-errno",
670670
"linkerd-error",
671-
"linkerd-error-metrics",
672671
"linkerd-error-respond",
673672
"linkerd-exp-backoff",
674673
"linkerd-http-classify",
@@ -750,6 +749,7 @@ dependencies = [
750749
"linkerd-tonic-watch",
751750
"linkerd-tracing",
752751
"linkerd2-proxy-api",
752+
"parking_lot",
753753
"thiserror",
754754
"tokio",
755755
"tokio-test",
@@ -801,6 +801,7 @@ dependencies = [
801801
"linkerd-identity",
802802
"linkerd-io",
803803
"linkerd-tracing",
804+
"parking_lot",
804805
"pin-project",
805806
"thiserror",
806807
"tokio",
@@ -910,17 +911,6 @@ dependencies = [
910911
"futures",
911912
]
912913

913-
[[package]]
914-
name = "linkerd-error-metrics"
915-
version = "0.1.0"
916-
dependencies = [
917-
"futures",
918-
"linkerd-metrics",
919-
"parking_lot",
920-
"pin-project",
921-
"tower",
922-
]
923-
924914
[[package]]
925915
name = "linkerd-error-respond"
926916
version = "0.1.0"

Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ members = [
2222
"linkerd/duplex",
2323
"linkerd/error",
2424
"linkerd/errno",
25-
"linkerd/error-metrics",
2625
"linkerd/error-respond",
2726
"linkerd/exp-backoff",
2827
"linkerd/http-box",

linkerd/app/admin/src/stack.rs

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@ use linkerd_app_core::{
77
serve,
88
svc::{self, ExtractParam, InsertParam, Param},
99
tls, trace,
10-
transport::{self, listen::Bind, ClientAddr, Local, Remote, ServerAddr},
11-
Error,
10+
transport::{self, listen::Bind, ClientAddr, Local, OrigDstAddr, Remote, ServerAddr},
11+
Error, Result,
1212
};
13+
use linkerd_app_inbound as inbound;
1314
use std::{pin::Pin, time::Duration};
1415
use thiserror::Error;
1516
use tokio::sync::mpsc;
@@ -64,7 +65,7 @@ impl Config {
6465
bind: B,
6566
identity: Option<LocalCrtKey>,
6667
report: R,
67-
metrics: metrics::Proxy,
68+
metrics: inbound::Metrics,
6869
trace: trace::Handle,
6970
drain: drain::Watch,
7071
shutdown: mpsc::UnboundedSender<()>,
@@ -79,11 +80,11 @@ impl Config {
7980
let (ready, latch) = crate::server::Readiness::new();
8081
let admin = crate::server::Admin::new(report, ready, shutdown, trace);
8182
let admin = svc::stack(move |_| admin.clone())
82-
.push(metrics.http_endpoint.to_layer::<classify::Response, _, Http>())
83+
.push(metrics.proxy.http_endpoint.to_layer::<classify::Response, _, Http>())
84+
.push(metrics.http_errors.to_layer())
8385
.push_on_service(
8486
svc::layers()
85-
.push(metrics.http_errors.clone())
86-
.push(errors::layer())
87+
.push(errors::respond::layer())
8788
.push(http::BoxResponse::layer()),
8889
)
8990
.push(http::NewServeHttp::layer(Default::default(), drain.clone()))
@@ -130,8 +131,10 @@ impl Config {
130131
)
131132
.push(svc::BoxNewService::layer())
132133
.push(detect::NewDetectService::layer(detect::Config::<http::DetectHttp>::from_timeout(DETECT_TIMEOUT)))
133-
.push(transport::metrics::NewServer::layer(metrics.transport))
134-
.push_map_target(|(tls, addrs): (tls::ConditionalServerTls, B::Addrs)| {
134+
.push(transport::metrics::NewServer::layer(metrics.proxy.transport))
135+
.push_map_target(move |(tls, addrs): (tls::ConditionalServerTls, B::Addrs)| {
136+
// TODO(ver): We should enforce policy here; but we need to permit liveness probes
137+
// for destination pods to startup...
135138
Tcp {
136139
tls,
137140
client: addrs.param(),
@@ -161,8 +164,7 @@ impl Param<transport::labels::Key> for Tcp {
161164
self.tls.clone(),
162165
self.addr.into(),
163166
// TODO(ver) enforce policies on the proxy's admin port.
164-
Default::default(),
165-
Default::default(),
167+
metrics::ServerLabel("default:admin".to_string()),
166168
)
167169
}
168170
}
@@ -175,13 +177,28 @@ impl Param<http::Version> for Http {
175177
}
176178
}
177179

180+
impl Param<OrigDstAddr> for Http {
181+
fn param(&self) -> OrigDstAddr {
182+
OrigDstAddr(self.tcp.addr.into())
183+
}
184+
}
185+
186+
impl Param<metrics::ServerLabel> for Http {
187+
fn param(&self) -> metrics::ServerLabel {
188+
metrics::ServerLabel("default:admin".to_string())
189+
}
190+
}
191+
178192
impl Param<metrics::EndpointLabels> for Http {
179193
fn param(&self) -> metrics::EndpointLabels {
180194
metrics::InboundEndpointLabels {
181195
tls: self.tcp.tls.clone(),
182196
authority: None,
183197
target_addr: self.tcp.addr.into(),
184-
policy: Default::default(),
198+
policy: metrics::AuthzLabels {
199+
server: self.param(),
200+
authz: "default:all-unauthenticated".to_string(),
201+
},
185202
}
186203
.into()
187204
}

linkerd/app/core/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ linkerd-detect = { path = "../../detect" }
2828
linkerd-duplex = { path = "../../duplex" }
2929
linkerd-errno = { path = "../../errno" }
3030
linkerd-error = { path = "../../error" }
31-
linkerd-error-metrics = { path = "../../error-metrics" }
3231
linkerd-error-respond = { path = "../../error-respond" }
3332
linkerd-exp-backoff = { path = "../../exp-backoff" }
3433
linkerd-http-classify = { path = "../../http-classify" }

linkerd/app/core/src/control.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ impl Config {
7979

8080
svc::stack(ConnectTcp::new(self.connect.keepalive))
8181
.push(tls::Client::layer(identity))
82-
.push_timeout(self.connect.timeout)
82+
.push_connect_timeout(self.connect.timeout)
8383
.push(self::client::layer())
8484
.push_on_service(svc::MapErrLayer::new(Into::into))
8585
.into_new_service()

linkerd/app/core/src/errors/mod.rs

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
pub mod respond;
2+
3+
use linkerd_error::Error;
4+
pub use linkerd_timeout::{FailFastError, ResponseTimeout};
5+
use thiserror::Error;
6+
7+
#[derive(Debug, Error)]
8+
#[error("connect timed out after {0:?}")]
9+
pub(crate) struct ConnectTimeout(pub std::time::Duration);
10+
11+
#[derive(Debug, Error)]
12+
#[error("{source}")]
13+
pub struct HttpError {
14+
#[source]
15+
source: Error,
16+
http_status: http::StatusCode,
17+
grpc_status: tonic::Code,
18+
}
19+
20+
impl HttpError {
21+
pub fn bad_request(source: impl Into<Error>) -> Self {
22+
Self {
23+
source: source.into(),
24+
http_status: http::StatusCode::BAD_REQUEST,
25+
grpc_status: tonic::Code::InvalidArgument,
26+
}
27+
}
28+
29+
pub fn forbidden(source: impl Into<Error>) -> Self {
30+
Self {
31+
source: source.into(),
32+
http_status: http::StatusCode::FORBIDDEN,
33+
grpc_status: tonic::Code::PermissionDenied,
34+
}
35+
}
36+
37+
pub fn loop_detected(source: impl Into<Error>) -> Self {
38+
Self {
39+
source: source.into(),
40+
http_status: http::StatusCode::LOOP_DETECTED,
41+
grpc_status: tonic::Code::Aborted,
42+
}
43+
}
44+
}

0 commit comments

Comments
 (0)