- Version: 1.1.2
- Last Updated: January 2026
This guide helps diagnose and resolve common issues with TelemetryFlow Agent.
# Check if agent is running
ps aux | grep tfo-agent
# Check systemd status
sudo systemctl status tfo-agent
# Check agent logs
journalctl -u tfo-agent -f
# Check health endpoint
curl http://localhost:13133/

# Validate configuration file
./tfo-agent config --config /etc/tfo-agent/tfo-agent.yaml
# Show parsed configuration
./tfo-agent config --config /etc/tfo-agent/tfo-agent.yaml --format yaml

# Test OTLP endpoint connectivity
nc -zv localhost 4317
# Test with grpcurl (if installed)
grpcurl -plaintext localhost:4317 list
# Test HTTP endpoint
curl -v http://localhost:4318/v1/metrics

Agent exits immediately after starting with an error.
Configuration file not found:
# Error: configuration file not found
# Solution: Specify correct path
./tfo-agent start --config /path/to/tfo-agent.yaml
# Check default locations
ls -la /etc/tfo-agent/tfo-agent.yaml
ls -la ./configs/tfo-agent.yaml

Invalid configuration:
# Error: validation failed
# Solution: Validate configuration
./tfo-agent config --config /path/to/tfo-agent.yaml
# Common issues:
# - Missing telemetryflow.endpoint
# - Invalid heartbeat.interval (< 10s)
# - Invalid protocol (must be grpc or http)

Permission denied:
# Error: permission denied
# Solution: Check file permissions
ls -la /etc/tfo-agent/tfo-agent.yaml
sudo chown telemetryflow:telemetryflow /etc/tfo-agent/tfo-agent.yaml
sudo chmod 640 /etc/tfo-agent/tfo-agent.yaml
# Check buffer directory permissions
sudo mkdir -p /var/lib/tfo-agent/buffer
sudo chown -R telemetryflow:telemetryflow /var/lib/tfo-agent

Port already in use:
# Error: address already in use
# Solution: Check what's using the port
lsof -i :4317
lsof -i :4318
# Kill existing process or use different port

Agent starts but cannot connect to the collector/backend.
Check endpoint configuration:
telemetryflow:
endpoint: "collector.example.com:4317" # Correct format
# endpoint: "http://collector.example.com:4317" # Wrong for gRPC
protocol: grpc

TLS issues:
telemetryflow:
endpoint: "collector.example.com:4317"
tls:
enabled: true
skip_verify: false # Set to true for self-signed certs (dev only)

Network connectivity:
# Test DNS resolution
nslookup collector.example.com
# Test TCP connectivity
telnet collector.example.com 4317
nc -zv collector.example.com 4317
# Test from container
docker exec tfo-agent nc -zv collector.example.com 4317

Firewall rules:
# Check iptables
sudo iptables -L -n | grep 4317
# Allow OTLP ports
sudo ufw allow 4317/tcp
sudo ufw allow 4318/tcp

Agent connects but receives 401/403 errors.
Check API credentials:
telemetryflow:
api_key_id: "tfk_your_key_id" # Must start with tfk_
api_key_secret: "tfs_your_secret" # Must start with tfs_

Environment variables:
# Check if env vars are set
echo $TELEMETRYFLOW_API_KEY_ID
echo $TELEMETRYFLOW_API_KEY_SECRET
# Set environment variables
export TELEMETRYFLOW_API_KEY_ID="tfk_xxx"
export TELEMETRYFLOW_API_KEY_SECRET="tfs_xxx"

Verify credentials in logs:
# Run with debug logging to see auth headers
./tfo-agent start --config config.yaml --log-level debug 2>&1 | grep -i auth

Agent is running but metrics are not visible in the backend.
Check collector configuration:
collector:
system:
enabled: true # Must be true
interval: 15s
cpu: true
memory: true
disk: true
network: true

Verify exporter is running:
# Check logs for export messages
journalctl -u tfo-agent | grep -i "export"
journalctl -u tfo-agent | grep -i "metric"

Check batch settings:
exporter:
otlp:
enabled: true
batch_size: 100
flush_interval: 10s # Metrics sent every 10 seconds

Verify metrics are being collected:
# Run with debug logging
./tfo-agent start --config config.yaml --log-level debug
# Look for collection messages
# "Collected metrics" with count > 0

Agent consumes excessive memory over time.
Adjust buffer settings:
buffer:
enabled: true
max_size_mb: 50 # Reduce from 100
path: "/var/lib/tfo-agent/buffer"

Reduce batch size:
exporter:
otlp:
batch_size: 50 # Reduce from 100
flush_interval: 5s # Flush more frequently

Check for memory leaks:
# Monitor memory usage
watch -n 5 'ps -o rss,vsz,pid,cmd -p $(pgrep tfo-agent)'
# Use pprof if enabled
curl http://localhost:8888/debug/pprof/heap > heap.prof
go tool pprof heap.prof

Agent consumes excessive CPU.
Increase collection interval:
collector:
system:
interval: 30s # Increase from 15s

Reduce collectors:
collector:
system:
enabled: true
cpu: true
memory: true
disk: false # Disable if not needed
network: false # Disable if not needed

Check for busy loops:
# Profile CPU usage
curl http://localhost:8888/debug/pprof/profile?seconds=30 > cpu.prof
go tool pprof cpu.prof

Buffer grows indefinitely or agent can't write to buffer.
Check disk space:
df -h /var/lib/tfo-agent
du -sh /var/lib/tfo-agent/buffer

Clear stale buffer:
# Stop agent first
sudo systemctl stop tfo-agent
# Clear buffer directory
sudo rm -rf /var/lib/tfo-agent/buffer/*
# Restart agent
sudo systemctl start tfo-agent

Verify buffer directory permissions:
ls -la /var/lib/tfo-agent/buffer
sudo chown -R telemetryflow:telemetryflow /var/lib/tfo-agent

Agent disconnects frequently or shows as offline in backend.
Adjust heartbeat settings:
heartbeat:
interval: 60s # Default is fine for most cases
timeout: 10s # Increase if network is slow
include_system_info: true

Check network stability:
# Continuous ping test
ping -c 100 collector.example.com
# Check for packet loss
mtr collector.example.com

Verify heartbeat in logs:
journalctl -u tfo-agent | grep -i heartbeat

# Check container logs
docker logs tfo-agent
# Check if config is mounted correctly
docker exec tfo-agent cat /etc/tfo-agent/tfo-agent.yaml
# Verify volume mounts
docker inspect tfo-agent | jq '.[].Mounts'

# Use host network for testing
docker run --network host telemetryflow/telemetryflow-agent:latest ...
# Or check bridge network
docker network inspect bridge

# Check pod status
kubectl get pods -l app=tfo-agent
# Check pod events
kubectl describe pod <pod-name>
# Check logs
kubectl logs <pod-name>

# Verify ConfigMap exists
kubectl get configmap tfo-agent-config -o yaml
# Check volume mount
kubectl exec <pod-name> -- cat /etc/tfo-agent/tfo-agent.yaml

# Check service account
kubectl get serviceaccount tfo-agent -o yaml
# Check RBAC
kubectl auth can-i --list --as=system:serviceaccount:default:tfo-agent

# Command line
./tfo-agent start --config config.yaml --log-level debug
# Environment variable
export TELEMETRYFLOW_LOG_LEVEL=debug
./tfo-agent start --config config.yaml
# In configuration
logging:
level: debug
format: json

# Full debug output to file
./tfo-agent start --config config.yaml --log-level debug 2>&1 | tee agent.log
# Filter specific components
journalctl -u tfo-agent | grep -E "(exporter|collector|heartbeat)"
# Real-time log watching
tail -f /var/log/tfo-agent/agent.log

Successful startup:
{"level":"info","msg":"Starting TelemetryFlow Agent","version":"1.1.2"}
{"level":"info","msg":"Configuration loaded","file":"config.yaml"}
{"level":"info","msg":"Agent started","id":"agent-001","hostname":"server-01"}
{"level":"info","msg":"Starting OTLP exporter","endpoint":"localhost:4317"}
{"level":"info","msg":"OTLP exporter started successfully"}
{"level":"info","msg":"Starting heartbeat","interval":"60s"}

Connection issues:
{"level":"error","msg":"Failed to export metrics","error":"connection refused"}
{"level":"warn","msg":"Retrying export","attempt":2,"max_attempts":3}

Authentication issues:
{ "level": "error", "msg": "Heartbeat failed", "error": "status code: 401" }

# Basic health check
curl http://localhost:13133/
# Expected response
{"status":"healthy","version":"1.1.2"}

# Prometheus metrics
curl http://localhost:8888/metrics
# Look for:
# tfo_agent_up 1
# tfo_agent_heartbeat_success_total
# tfo_agent_export_success_total

Before requesting help, collect:
# Agent version
./tfo-agent version --json > diagnostics.json
# Configuration (sanitize secrets)
./tfo-agent config --config config.yaml --format yaml | sed 's/api_key_secret:.*/api_key_secret: [REDACTED]/' >> diagnostics.txt
# Recent logs
journalctl -u tfo-agent --since "1 hour ago" >> diagnostics.txt
# System info
uname -a >> diagnostics.txt
cat /etc/os-release >> diagnostics.txt

- Issues: GitHub Issues
- Documentation: https://docs.telemetryflow.id
- Email: support@telemetryflow.id
Copyright (c) 2024-2026 DevOpsCorner Indonesia. All rights reserved.