This content originally appeared on DEV Community and was authored by ByteLedger
#!/bin/bash
# =============================================================
# Script: Cronjob_monitorpod.sh
# Purpose:
# 1. Trigger a Kubernetes CronJob manually.
# 2. Monitor the created Job & Pod lifecycle.
# 3. Collect pod logs and periodic CPU/memory metrics.
# 4. Additionally, detect "/app/ShieldCache" activity in logs
# and correlate it with real-time pod resource usage.
#
# Author:
# Compatible OS: Linux (RHEL/CentOS)
# =============================================================
# --- 1. Basic Setup ---
# Locate the script on disk. `realpath -s` yields an absolute path without
# resolving symlinks; all output folders are created next to that path.
SCRIPT="$(realpath -s "$0")"
SCRIPTPATH="$(dirname "$SCRIPT")"

# Kubernetes targets.
NAMESPACE="onprem"           # namespace used for every kubectl call
CRONJOB_NAME="teams-chat"    # CronJob the manual Job is created from
JOB_NAME_BASE="teams-chat"   # prefix of the generated Job name

# A single timestamp per run is shared by the Job name and all output files.
TIMESTAMP="$(date +'%Y%m%d_%H%M%S')"
JOB_NAME="${JOB_NAME_BASE}-${TIMESTAMP}"

# Output folders (creation is best-effort; failures are ignored here).
LOG_DIR="${SCRIPTPATH}/Logs"
LASTLOG_DIR="${SCRIPTPATH}/Lastlog"
OUTPUT_DIR="${SCRIPTPATH}/output"
mkdir -p "$LOG_DIR" "$LASTLOG_DIR" "$OUTPUT_DIR" >/dev/null 2>&1 || true

# Per-run files, all placed in LOG_DIR.
LOG_PATH="${LOG_DIR}/${TIMESTAMP}-${JOB_NAME_BASE}.log"
RESOURCE_CSV="${LOG_DIR}/${TIMESTAMP}-${JOB_NAME_BASE}_pod_resources.csv"
CACHE_MONITOR_CSV="${LOG_DIR}/${TIMESTAMP}-${JOB_NAME_BASE}_shieldcache_activity.csv"

# Strings searched for in the collected log once the pod has finished.
ERROR_KEYWORDS=(
  "java.io.EOFException"
  "NoSuchMethodError"
  "Error"
)
SUCCESS_KEYWORDS=(
  "TGM Policy Ended"
  "Job completed"
)
# =============================================================
# 2. Validate the Namespace
# =============================================================
# Abort early (exit 1) when the target namespace cannot be read.
# The start message is written to the log file as well as the terminal.
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Checking if namespace '$NAMESPACE' exists..." | tee -a "$LOG_PATH"

# A missing kubectl binary would otherwise be reported as a missing namespace.
if ! command -v kubectl >/dev/null 2>&1; then
  echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] kubectl not found in PATH. Exiting." | tee -a "$LOG_PATH"
  cp "$LOG_PATH" "$LASTLOG_DIR/lastpodlog.txt" 2>/dev/null || true
  exit 1
fi

# Capture only stderr (stdout is discarded) so the real reason for a failed
# lookup - not found, forbidden, API unreachable - ends up in the log.
if ! NAMESPACE_CHECK_ERR="$(kubectl get namespace "$NAMESPACE" 2>&1 >/dev/null)"; then
  echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Namespace '$NAMESPACE' not found. Exiting." | tee -a "$LOG_PATH"
  if [[ -n "$NAMESPACE_CHECK_ERR" ]]; then
    echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] kubectl reported: $NAMESPACE_CHECK_ERR" | tee -a "$LOG_PATH"
  fi
  cp "$LOG_PATH" "$LASTLOG_DIR/lastpodlog.txt" 2>/dev/null || true
  exit 1
fi
# =============================================================
# 3. Trigger the CronJob manually (create a Job)
# =============================================================
# Create a one-off Job from the CronJob template. Exits 1 when creation fails.
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Creating Job '$JOB_NAME' from CronJob '$CRONJOB_NAME' in namespace '$NAMESPACE'..." | tee -a "$LOG_PATH"

# Capture stderr only; kubectl's normal stdout confirmation stays suppressed.
JOB_CREATE_ERR="$(kubectl create job --from=cronjob/"$CRONJOB_NAME" -n "$NAMESPACE" "$JOB_NAME" 2>&1 >/dev/null)"
JOB_CREATE_RC=$?

if (( JOB_CREATE_RC != 0 )); then
  echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Failed to create job from CronJob." | tee -a "$LOG_PATH"
  # Keep kubectl's own explanation in the log, and still show it on stderr.
  if [[ -n "$JOB_CREATE_ERR" ]]; then
    printf '%s\n' "$JOB_CREATE_ERR" | tee -a "$LOG_PATH" >&2
  fi
  # Refresh the "last log" copy here too, as the other logged failure paths do.
  cp "$LOG_PATH" "$LASTLOG_DIR/lastpodlog.txt" 2>/dev/null || true
  exit 1
fi

# On success, pass any kubectl warnings through to the terminal only; they are
# kept out of the log file because that file is later scanned for error keywords.
if [[ -n "$JOB_CREATE_ERR" ]]; then
  printf '%s\n' "$JOB_CREATE_ERR" >&2
fi
# =============================================================
# 4. Wait for the Pod to appear for this Job (pick newest)
# =============================================================
#######################################
# Print the name of the most recently created pod belonging to the Job.
# Globals:   NAMESPACE (read), JOB_NAME (read)
# Arguments: none
# Outputs:   the pod name on stdout; nothing when no pod matches or
#            kubectl fails (its stderr is discarded)
#######################################
get_newest_pod() {
  local job_selector="job-name=${JOB_NAME}"
  local name_per_line='{range .items[*]}{.metadata.name}{"\n"}{end}'

  # Pods are listed oldest-first, so the last line is the newest pod.
  kubectl -n "$NAMESPACE" get pods \
    -l "$job_selector" \
    --sort-by=.metadata.creationTimestamp \
    -o jsonpath="$name_per_line" 2>/dev/null \
    | sed -n '$p'
}
# Poll every 10s until the Job has produced a pod, then record its name/node.
# The wait is bounded: POD_WAIT_TIMEOUT (seconds, default 600) may be set in
# the environment; 0 restores an unlimited wait. On timeout the script exits 1.
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Waiting for Pod associated with Job '$JOB_NAME'..." | tee -a "$LOG_PATH"
POD_NAME=""
POD_WAIT_TIMEOUT="${POD_WAIT_TIMEOUT:-600}"
POD_WAIT_STARTED=$SECONDS   # bash built-in: seconds since the shell started

while true; do
  POD_NAME="$(get_newest_pod)"
  if [[ -n "$POD_NAME" ]]; then
    # nodeName is empty until the scheduler has placed the pod.
    NODE_NAME="$(kubectl get pod "$POD_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.nodeName}')"
    echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Pod '$POD_NAME' created on node '${NODE_NAME:-<unscheduled>}'." | tee -a "$LOG_PATH"
    break
  fi

  # Give up instead of hanging forever when the Job never spawns a pod.
  if (( POD_WAIT_TIMEOUT > 0 && SECONDS - POD_WAIT_STARTED >= POD_WAIT_TIMEOUT )); then
    echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] No Pod appeared for Job '$JOB_NAME' within ${POD_WAIT_TIMEOUT}s. Exiting." | tee -a "$LOG_PATH"
    cp "$LOG_PATH" "$LASTLOG_DIR/lastpodlog.txt" 2>/dev/null || true
    exit 1
  fi

  echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Pod not yet created... retrying in 10s." | tee -a "$LOG_PATH"
  sleep 10
done
# =============================================================
# 5. Prepare CSV headers
# =============================================================
# Start both CSV files from scratch (truncating any existing content) with
# their column header rows.
printf '%s\n' "timestamp,pod,cpu,mem" > "$RESOURCE_CSV"
printf '%s\n' "timestamp,pod,cpu,mem,log_snippet" > "$CACHE_MONITOR_CSV"
# =============================================================
# 6. Start ShieldCache-specific monitor (background)
# =============================================================
#######################################
# Format one data row for the ShieldCache activity CSV.
# Arguments:
#   $1 - timestamp
#   $2 - pod name
#   $3 - "cpu,mem" pair; when empty, two empty columns are emitted so the
#        row still has the same number of columns as the header
#   $4 - raw log line; truncated to 150 characters, embedded double quotes
#        are doubled (CSV quoting) and the result is wrapped in quotes
# Outputs: the CSV row on stdout
#######################################
shieldcache_csv_row() {
  local ts="$1" pod="$2" usage="${3:-,}" msg="${4:0:150}"
  local dq='"'
  msg=${msg//$dq/$dq$dq}
  printf '%s,%s,%s,"%s"\n' "$ts" "$pod" "$usage" "$msg"
}

echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Starting background monitor for /app/ShieldCache activity..." | tee -a "$LOG_PATH"
(
  # `kubectl logs -f` fails immediately while the container has not started
  # yet, so keep retrying until the stream has delivered at least one line or
  # the pod is no longer Pending/Running.
  while true; do
    streamed=0
    while IFS= read -r line || [[ -n "$line" ]]; do
      streamed=1
      [[ "$line" == *"/app/ShieldCache"* ]] || continue
      ts="$(date +'%Y-%m-%d %H:%M:%S')"
      # stdin is detached so kubectl can never consume lines meant for `read`.
      usage_line="$(kubectl top pod "$POD_NAME" -n "$NAMESPACE" --no-headers 2>/dev/null </dev/null | awk '{print $2","$3}')"
      shieldcache_csv_row "$ts" "$POD_NAME" "$usage_line" "$line" >> "$CACHE_MONITOR_CSV"
    done < <(kubectl logs -f "$POD_NAME" -n "$NAMESPACE" --timestamps=true 2>/dev/null)

    # Once anything was streamed, do not re-attach: a second `logs -f` would
    # replay the log from the start and duplicate rows.
    (( streamed )) && break

    phase="$(kubectl get pod "$POD_NAME" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null </dev/null)"
    case "$phase" in
      Pending|Running) sleep 5 ;;
      *) break ;;
    esac
  done
) &
CACHE_MON_PID=$!
# =============================================================
# 7. Main pod monitoring loop
# =============================================================
# Poll the pod phase until it finishes, appending one CPU/memory sample to
# RESOURCE_CSV per poll whenever `kubectl top` returns data.
#   - Running/Pending : keep polling every 30s
#   - Succeeded       : leave the loop and continue with the script
#   - Failed          : save the pod's logs, then exit 2
#   - anything else   : retry every 20s, at most MAX_UNKNOWN_POLLS times in a
#                       row (env-overridable, default 30), then exit 5
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Monitoring pod '$POD_NAME' resource usage..." | tee -a "$LOG_PATH"
MAX_UNKNOWN_POLLS="${MAX_UNKNOWN_POLLS:-30}"
UNKNOWN_POLLS=0

while true; do
  # An empty status means kubectl could not read the pod (e.g. it was deleted).
  POD_STATUS="$(kubectl get pod "$POD_NAME" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "")"

  # Periodic CPU/MEM usage collection
  USAGE_LINE="$(kubectl top pod "$POD_NAME" -n "$NAMESPACE" --no-headers 2>/dev/null | awk -v ts="$(date +'%Y-%m-%d %H:%M:%S')" '{print ts","$1","$2","$3}')"
  [[ -n "$USAGE_LINE" ]] && echo "$USAGE_LINE" >> "$RESOURCE_CSV"

  # NOTE(review): per the Kubernetes docs .status.phase is one of Pending,
  # Running, Succeeded, Failed or Unknown; the extra patterns below
  # (Completed, Error, CrashLoopBackOff) are kept only for compatibility.
  case "$POD_STATUS" in
    Running|Pending)
      UNKNOWN_POLLS=0
      echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Pod '$POD_NAME' is $POD_STATUS. Sleeping 30s..." | tee -a "$LOG_PATH"
      sleep 30
      ;;
    Succeeded|Completed)
      echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Pod '$POD_NAME' completed successfully." | tee -a "$LOG_PATH"
      break
      ;;
    Failed|Error|CrashLoopBackOff)
      echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Pod '$POD_NAME' failed with status $POD_STATUS." | tee -a "$LOG_PATH"
      # A failed run is when the pod's output matters most: save it before exiting.
      kubectl logs "$POD_NAME" -n "$NAMESPACE" --timestamps=true >> "$LOG_PATH" 2>&1
      cp "$LOG_PATH" "$LASTLOG_DIR/lastpodlog.txt" 2>/dev/null || true
      kill "$CACHE_MON_PID" >/dev/null 2>&1
      exit 2
      ;;
    *)
      UNKNOWN_POLLS=$((UNKNOWN_POLLS + 1))
      # Do not spin forever when the pod can no longer be queried.
      if (( UNKNOWN_POLLS >= MAX_UNKNOWN_POLLS )); then
        echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Pod '$POD_NAME' status could not be determined after $UNKNOWN_POLLS consecutive checks. Exiting." | tee -a "$LOG_PATH"
        cp "$LOG_PATH" "$LASTLOG_DIR/lastpodlog.txt" 2>/dev/null || true
        kill "$CACHE_MON_PID" >/dev/null 2>&1
        exit 5
      fi
      echo "$(date +'%Y-%m-%d %H:%M:%S') [WARN] Unknown pod status '$POD_STATUS'. Waiting 20s..." | tee -a "$LOG_PATH"
      sleep 20
      ;;
  esac
done
# =============================================================
# 8. Fetch final logs (after completion)
# =============================================================
# Announce the fetch, then append the pod's complete timestamped log
# (stdout and stderr of kubectl alike) to the run log.
printf "%s [INFO] Fetching final logs for pod '%s'...\n" "$(date +'%Y-%m-%d %H:%M:%S')" "$POD_NAME" \
  | tee -a "$LOG_PATH"
kubectl -n "$NAMESPACE" logs "$POD_NAME" --timestamps=true >> "$LOG_PATH" 2>&1
# =============================================================
# 9. Check for error & success keywords
# =============================================================
#######################################
# Test whether a file contains a keyword.
# The keyword is matched as a fixed string (-F), case-insensitively, so
# characters such as '.' in "java.io.EOFException" are not regex wildcards.
# Arguments: $1 - keyword, $2 - file to search
# Returns:   0 when found, non-zero otherwise (grep's exit status)
#######################################
log_contains_keyword() {
  local keyword="$1" file="$2"
  grep -Fiq -- "$keyword" "$file"
}

# Any error keyword present in the log -> exit 3.
for keyword in "${ERROR_KEYWORDS[@]}"; do
  if log_contains_keyword "$keyword" "$LOG_PATH"; then
    echo "$(date +'%Y-%m-%d %H:%M:%S') [ERROR] Found error keyword: '$keyword'" | tee -a "$LOG_PATH"
    cp "$LOG_PATH" "$LASTLOG_DIR/lastpodlog.txt" 2>/dev/null || true
    kill "$CACHE_MON_PID" >/dev/null 2>&1
    exit 3
  fi
done

# Every success keyword must be present; the first missing one -> exit 4.
for keyword in "${SUCCESS_KEYWORDS[@]}"; do
  if ! log_contains_keyword "$keyword" "$LOG_PATH"; then
    echo "$(date +'%Y-%m-%d %H:%M:%S') [WARN] Success keyword '$keyword' not found." | tee -a "$LOG_PATH"
    cp "$LOG_PATH" "$LASTLOG_DIR/lastpodlog.txt" 2>/dev/null || true
    kill "$CACHE_MON_PID" >/dev/null 2>&1
    exit 4
  fi
done
# =============================================================
# 10. Wrap up
# =============================================================
# Stop the background ShieldCache monitor; it may already have exited.
kill "$CACHE_MON_PID" >/dev/null 2>&1 || true

echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Job '$JOB_NAME' completed successfully." | tee -a "$LOG_PATH"
echo "$(date +'%Y-%m-%d %H:%M:%S') [INFO] Output saved to: $LOG_DIR" | tee -a "$LOG_PATH"

# Copy the log last so lastpodlog.txt also contains the final status lines.
cp "$LOG_PATH" "$LASTLOG_DIR/lastpodlog.txt" 2>/dev/null || true
exit 0
This content originally appeared on DEV Community and was authored by ByteLedger