Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion deploy/helm/moai-inference-framework/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -199,10 +199,13 @@ Moreh Inference Framework
| vector.customConfig.sinks.loki.labels.pool_name | string | `"{{`{{ pool_name }}`}}"` | |
| vector.customConfig.sinks.loki.labels.role | string | `"{{`{{ role }}`}}"` | |
| vector.customConfig.sinks.loki.type | string | `"loki"` | |
| vector.customConfig.sources.aigateway_logs.extra_label_selector | string | `"app.kubernetes.io/name=aigateway"` | |
| vector.customConfig.sources.aigateway_logs.type | string | `"kubernetes_logs"` | |
| vector.customConfig.sources.mif_logs.extra_label_selector | string | `"mif.moreh.io/log.collect=true"` | |
| vector.customConfig.sources.mif_logs.type | string | `"kubernetes_logs"` | |
| vector.customConfig.transforms.mif_log_transform.inputs[0] | string | `"mif_logs"` | |
| vector.customConfig.transforms.mif_log_transform.source | string | `".namespace = .kubernetes.pod_namespace\n.node_name = \"$VECTOR_SELF_NODE_NAME\"\n.app = get(.kubernetes.pod_labels, [\"app.kubernetes.io/name\"]) ?? \"\"\n.inference_service = get(.kubernetes.pod_labels, [\"app.kubernetes.io/instance\"]) ?? \"\"\n.pool_name = get(.kubernetes.pod_labels, [\"mif.moreh.io/pool\"]) ?? \"\"\n.role = get(.kubernetes.pod_labels, [\"mif.moreh.io/role\"]) ?? \"\"\n\nlog_format = get(.kubernetes.pod_labels, [\"mif.moreh.io/log.format\"]) ?? \"\"\n\nif log_format == \"json\" {\n structured, err = parse_json(.message)\n if err == null {\n . = merge!(., structured)\n msg, err = get(., [\"msg\"])\n if err == null {\n .message = msg\n del(.msg)\n }\n time, err = get(., [\"time\"])\n if err == null {\n .timestamp = time\n del(.time)\n }\n }\n}\n\ndel(.file)\ndel(.source_type)\ndel(.stream)\ndel(.kubernetes)\n"` | |
| vector.customConfig.transforms.mif_log_transform.inputs[1] | string | `"aigateway_logs"` | |
| vector.customConfig.transforms.mif_log_transform.source | string | `".namespace = .kubernetes.pod_namespace\n.node_name = \"$VECTOR_SELF_NODE_NAME\"\n.app = get(.kubernetes.pod_labels, [\"app.kubernetes.io/name\"]) ?? \"\"\n.inference_service = get(.kubernetes.pod_labels, [\"app.kubernetes.io/instance\"]) ?? \"\"\n.pool_name = get(.kubernetes.pod_labels, [\"mif.moreh.io/pool\"]) ?? \"\"\n.role = get(.kubernetes.pod_labels, [\"mif.moreh.io/role\"]) ?? \"\"\n\nlog_format = get(.kubernetes.pod_labels, [\"mif.moreh.io/log.format\"]) ?? \"\"\n\n# AIGateway always emits flat JSON (timestamp/level/target/message plus\n# request_id/trace_id span fields) but cannot carry the log.format opt-in\n# label, so parse JSON for it unconditionally.\nif log_format == \"json\" || .app == \"aigateway\" {\n structured, err = parse_json(.message)\n if err == null {\n . = merge!(., structured)\n # Go slog emits \"msg\"/\"time\"; Rust components (AIGateway) emit\n # \"message\"/\"timestamp\", which merge! already placed directly.\n # Guard the slog promotion on field existence: get(., [\"msg\"])\n # returns null WITHOUT an error when \"msg\" is absent, so an\n # `err == null` guard would null the merged message/timestamp.\n if exists(.msg) {\n .message = .msg\n del(.msg)\n }\n if exists(.time) {\n .timestamp = .time\n del(.time)\n }\n }\n}\n\ndel(.file)\ndel(.source_type)\ndel(.stream)\ndel(.kubernetes)\n"` | |
| vector.customConfig.transforms.mif_log_transform.type | string | `"remap"` | |
| vector.enabled | bool | `true` | Enable vector/vector as a DaemonSet log collector. |
| vector.role | string | `"Agent"` | |
Expand Down
30 changes: 23 additions & 7 deletions deploy/helm/moai-inference-framework/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -401,11 +401,21 @@ vector:
type: kubernetes_logs
# Only collect logs from pods that explicitly opt in to MIF log collection.
extra_label_selector: "mif.moreh.io/log.collect=true"
aigateway_logs:
type: kubernetes_logs
# AIGateway is a first-class MIF component. The Heimdall controller stamps
# the immutable label app.kubernetes.io/name=aigateway on every AIGateway
# pod and exposes no field to set mif.moreh.io/log.collect, so its logs are
# collected automatically by that label — symmetric with how the controller
# auto-creates a ServiceMonitor/PodMonitor for its metrics (no per-pod opt-in).
# Disjoint from mif_logs: an AIGateway pod has no path to log.collect=true.
extra_label_selector: "app.kubernetes.io/name=aigateway"
transforms:
mif_log_transform:
type: remap
inputs:
- mif_logs
- aigateway_logs
source: |
.namespace = .kubernetes.pod_namespace
.node_name = "$VECTOR_SELF_NODE_NAME"
Expand All @@ -416,18 +426,24 @@ vector:

log_format = get(.kubernetes.pod_labels, ["mif.moreh.io/log.format"]) ?? ""

if log_format == "json" {
# AIGateway always emits flat JSON (timestamp/level/target/message plus
# request_id/trace_id span fields) but cannot carry the log.format opt-in
# label, so parse JSON for it unconditionally.
if log_format == "json" || .app == "aigateway" {
structured, err = parse_json(.message)
if err == null {
. = merge!(., structured)
msg, err = get(., ["msg"])
if err == null {
.message = msg
# Go slog emits "msg"/"time"; Rust components (AIGateway) emit
# "message"/"timestamp", which merge! already placed directly.
# Guard the slog promotion on field existence: get(., ["msg"])
# returns null WITHOUT an error when "msg" is absent, so an
# `err == null` guard would null the merged message/timestamp.
if exists(.msg) {
.message = .msg
del(.msg)
}
time, err = get(., ["time"])
if err == null {
.timestamp = time
if exists(.time) {
.timestamp = .time
del(.time)
}
}
Expand Down
58 changes: 43 additions & 15 deletions website/docs/operations/monitoring/logs/index.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ This document explains how to enable centralized log collection for the MoAI Inf

```mermaid
flowchart TB
pods["`**Inference Service Pods**`"]
pods["`**Inference Service & AIGateway Pods**`"]
vector["`**Vector**`"]
grafana["`**Grafana**`"]

Expand Down Expand Up @@ -46,8 +46,8 @@ flowchart TB
| :---------- | :------------------------------------------------------------------------ |
| Helm chart | `vector/vector` v0.39.0 |
| Deployment | DaemonSet (Agent mode, one pod per node) |
| Log source | Pods labelled `mif.moreh.io/log.collect=true` (`kubernetes_logs`) |
| Log format | JSON parsing applied only to pods labelled `mif.moreh.io/log.format=json` |
| Log source | `kubernetes_logs` sources selecting pods labelled `mif.moreh.io/log.collect=true`, plus AIGateway pods (`app.kubernetes.io/name=aigateway`), which are collected automatically |
| Log format | JSON parsing applied to pods labelled `mif.moreh.io/log.format=json`, and always to AIGateway pods |
| Tolerations | unschedulable, compute, `amd.com/gpu` |

### MinIO
Expand Down Expand Up @@ -131,11 +131,11 @@ kubectl logs -n mif -l app.kubernetes.io/name=vector --tail=50

## Enabling log collection for a pod

Vector collects logs only from pods that explicitly opt in. Two pod labels control this behavior.
Most pods must opt in to log collection explicitly. Two pod labels control this behavior: one enables collection and the other selects JSON parsing. First-class components such as AIGateway are collected automatically — see [Automatically collected components](#automatically-collected-components).

### Opt-in label

Add the `mif.moreh.io/log.collect=true` label to a pod to include its logs in Vector's collection. Pods without this label are ignored entirely.
Add the `mif.moreh.io/log.collect=true` label to a pod to include its logs in Vector's collection. Pods without this label are ignored, except for components collected automatically (see [Automatically collected components](#automatically-collected-components)).

```yaml
metadata:
Expand All @@ -147,12 +147,14 @@ metadata:

Add the `mif.moreh.io/log.format=json` label to enable structured JSON log parsing for a pod. When set, Vector parses each log line as JSON and promotes the following fields:

| JSON field | Mapped to |
| :--------- | :-------------------- |
| `msg` | `message` |
| `time` | `timestamp` |
| `level` | `level` (Loki label) |
| others | merged into the event |
| JSON field | Mapped to |
| :-------------------- | :-------------------- |
| `msg` or `message` | `message` |
| `time` or `timestamp` | `timestamp` |
| `level` | `level` (Loki label) |
| others | merged into the event |

Both common conventions are accepted: Go components emit `msg`/`time` (for example, the Heimdall scheduler), while Rust components emit `message`/`timestamp` (for example, AIGateway).

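Without this label, the log line is forwarded as-is without any JSON parsing. The one exception is AIGateway: its pods cannot carry this label, so their logs are always parsed as JSON.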

Expand All @@ -164,11 +166,19 @@ metadata:
```

:::info
The `level` Loki label is only populated for pods with `mif.moreh.io/log.format=json`. For plain-text pods, `level` remains empty.
The `level` Loki label is only populated for JSON-parsed pods (those labelled `mif.moreh.io/log.format=json`, plus AIGateway). For plain-text pods, `level` remains empty.
:::

---

## Automatically collected components

AIGateway pods are collected automatically — no opt-in label is required. The Heimdall controller stamps the immutable label `app.kubernetes.io/name=aigateway` on every AIGateway pod and exposes no field to set `mif.moreh.io/log.collect`, so Vector selects these pods by that label and always parses their JSON output. This mirrors AIGateway metrics, which the controller exposes through an auto-created ServiceMonitor and PodMonitor with no per-pod configuration.

Query AIGateway logs in Grafana with the `app="aigateway"` selector (see [Searching logs in Grafana](#searching-logs-in-grafana)).

---

## Searching logs in Grafana

### Accessing Grafana
Expand Down Expand Up @@ -209,9 +219,9 @@ Vector enriches every log entry with the following labels, which can be used as
| `inference_service` | pod label `app.kubernetes.io/instance` | `llama-3-2-1b` |
| `pool_name` | pod label `mif.moreh.io/pool` | `heimdall-inference-scheduler` |
| `role` | pod label `mif.moreh.io/role` | `prefill`, `decode` |
| `app` | pod label `app.kubernetes.io/name` | `vllm` |
| `app` | pod label `app.kubernetes.io/name` | `vllm`, `aigateway` |
| `node_name` | `VECTOR_SELF_NODE_NAME` env var (injected by Vector) | `gpu-node-01` |
| `level` | parsed from JSON log field `level` (pods with `mif.moreh.io/log.format=json` only) | `info`, `warn`, `error` |
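| `level` | parsed from JSON log field `level` (JSON-parsed pods only); the value is kept exactly as emitted, so its case depends on the component | `info`, `warn`, `error` (AIGateway: `INFO`, `WARN`, `ERROR`) |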

### Query examples

Expand All @@ -237,13 +247,31 @@ Filter by log level (available only for JSON-formatted pods):
```

:::info
The `level` label is only available for pods with the `mif.moreh.io/log.format=json` label. To filter plain-text logs by level, use a pipeline filter instead:
The `level` label is only available for JSON-parsed pods. To filter plain-text logs by level, use a pipeline filter instead:

```promql
{namespace="default"} |= "ERROR"
```
:::

### AIGateway logs

AIGateway logs are collected automatically and always parsed as JSON. Select them with the `app` label, and use `| json` to expose fields such as `target`, `request_id`, and `trace_id`:

```promql {3}
{app="aigateway"}
{app="aigateway"} | json
{app="aigateway"} | json | request_id="<requestId>"
```

AIGateway emits uppercase levels (`INFO`, `DEBUG`, `WARN`, `ERROR`), so filter by level with the uppercase value:

```promql
{app="aigateway", level="DEBUG"}
```

Each parsed line includes a `trace_id` field for correlating a log with its trace in Tempo. Automatic log-to-trace links require a `derivedFields` entry on the Loki datasource, which this chart does not configure by default.

---

## Using an external MinIO
Expand Down
Loading