From a59d3b565dfb027abdc9fc599fe32b3891c82340 Mon Sep 17 00:00:00 2001 From: Alexandre <44178713+alexbelgium@users.noreply.github.com> Date: Tue, 4 Feb 2025 14:11:55 +0100 Subject: [PATCH] Update 30-monitoring.sh --- .../rootfs/custom-services.d/30-monitoring.sh | 65 +++++++++++-------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/birdnet-pi/rootfs/custom-services.d/30-monitoring.sh b/birdnet-pi/rootfs/custom-services.d/30-monitoring.sh index f9615cac0..d0378fcb1 100755 --- a/birdnet-pi/rootfs/custom-services.d/30-monitoring.sh +++ b/birdnet-pi/rootfs/custom-services.d/30-monitoring.sh @@ -1,7 +1,6 @@ #!/usr/bin/env bash # shellcheck shell=bash -# Improved BirdNET-Pi Monitoring Script -# Adapted and enhanced based on your original script. +# Improved BirdNET-Pi Monitoring Script with Recovery Alerts HOME="/home/pi" @@ -21,7 +20,7 @@ set +u source /etc/birdnet/birdnet.conf ######################################## -# Wait 1 minutes for system stabilization +# Wait 1 minute for system stabilization ######################################## sleep 1m @@ -32,7 +31,6 @@ log_green "Starting service: throttlerecording" ######################################## INGEST_DIR="${RECS_DIR/StreamData:-$HOME/BirdSongs/StreamData}" ANALYZING_NOW_FILE="$INGEST_DIR/analyzing_now.txt" -# Create the file if it does not exist. touch "$ANALYZING_NOW_FILE" # Ensure directories and set permissions @@ -47,14 +45,14 @@ ANALYZER_SERVICE="birdnet_analysis" # Notification settings NOTIFICATION_INTERVAL=1800 # seconds (30 minutes) last_notification_time=0 +issue_reported=0 # 1 = an issue was reported, 0 = system is normal # Disk usage threshold (percentage) DISK_USAGE_THRESHOLD=95 # "Analyzing" file check variables same_file_counter=0 -SAME_FILE_THRESHOLD=10 # number of iterations to consider the file as "stuck" -# Initialize the content of analyzing_now from the file +SAME_FILE_THRESHOLD=10 if [[ -f "$ANALYZING_NOW_FILE" ]]; then analyzing_now=$(cat "$ANALYZING_NOW_FILE") else @@ -65,14 +63,13 @@ fi # Functions ######################################## -# Send a notification using Apprise. +# Send an issue notification apprisealert() { local issue_message="$1" local current_time current_time=$(date +%s) local time_diff=$(( current_time - last_notification_time )) - # Throttle notifications if (( time_diff < NOTIFICATION_INTERVAL )); then log_yellow "Notification suppressed (last sent ${time_diff} seconds ago)" return @@ -81,7 +78,7 @@ apprisealert() { local notification="" local stopped_service="
Stopped services: " - # Check for stopped services (add or remove services as needed) + # Check for stopped services local services=(birdnet_analysis chart_viewer spectrogram_viewer icecast2 birdnet_recording birdnet_log birdnet_stats) for service in "${services[@]}"; do if [[ "$(systemctl is-active "$service")" != "active" ]]; then @@ -89,7 +86,6 @@ apprisealert() { fi done - # Build the notification message in HTML format. notification+="Issue: $issue_message" notification+="$stopped_service" notification+="
System: ${SITE_NAME:-$(hostname)}" @@ -100,54 +96,78 @@ apprisealert() { if [[ -f "$HOME/BirdNET-Pi/birdnet/bin/apprise" && -s "$HOME/BirdNET-Pi/apprise.txt" ]]; then "$HOME/BirdNET-Pi/birdnet/bin/apprise" -vv -t "$TITLE" -b "$notification" --input-format=html --config="$HOME/BirdNET-Pi/apprise.txt" last_notification_time=$current_time + issue_reported=1 # Mark that an issue was reported else log_red "Apprise not configured or missing!" fi } -# Check and restart a given service if it is not active. +# Send a "System is back to normal" notification +apprisealert_recovery() { + if (( issue_reported == 1 )); then + log_green "$(date) INFO: System is back to normal. Sending recovery notification." + + local TITLE="BirdNET-Pi System Recovered" + local notification="All monitored services are back to normal.
" + notification+="System: ${SITE_NAME:-$(hostname)}
" + notification+="Available disk space: $(df -h "$HOME/BirdSongs" | awk 'NR==2 {print $4}')" + + if [[ -f "$HOME/BirdNET-Pi/birdnet/bin/apprise" && -s "$HOME/BirdNET-Pi/apprise.txt" ]]; then + "$HOME/BirdNET-Pi/birdnet/bin/apprise" -vv -t "$TITLE" -b "$notification" --input-format=html --config="$HOME/BirdNET-Pi/apprise.txt" + fi + issue_reported=0 # Reset issue tracker + fi +} + +# Restart a service if inactive check_and_restart_service() { local service_name="$1" local state state=$(systemctl is-active "$service_name") + if [[ "$state" != "active" ]]; then log_yellow "$(date) INFO: Restarting $service_name" sudo systemctl restart "$service_name" sleep 61 state=$(systemctl is-active "$service_name") + if [[ "$state" != "active" ]]; then log_red "$(date) WARNING: $service_name could not restart" - apprisealert "$service_name cannot restart ! Your system seems stuck." + apprisealert "$service_name cannot restart! Your system seems stuck." fi fi } -# Check disk usage and send a notification if above threshold. +# Check disk usage check_disk_space() { local current_usage current_usage=$(df -h "$HOME/BirdSongs" | awk 'NR==2 {print $5}' | sed 's/%//') + if (( current_usage >= DISK_USAGE_THRESHOLD )); then - log_red "$(date) WARNING: Disk usage is at ${current_usage}% (threshold is ${DISK_USAGE_THRESHOLD}%)" + log_red "$(date) WARNING: Disk usage is at ${current_usage}%" apprisealert "Disk usage critical: ${current_usage}%" + else + apprisealert_recovery fi } -# Handle the file queue (number of .wav files in the ingest directory). +# Handle queue size handle_queue() { local wav_count="$1" + if (( wav_count > 50 )); then - log_red "$(date) WARNING: Too many files in queue (>50). Pausing ${RECORDER_SERVICE} and restarting ${ANALYZER_SERVICE}" + log_red "$(date) WARNING: Queue >50. Pausing ${RECORDER_SERVICE} and restarting ${ANALYZER_SERVICE}" apprisealert "Queue >50: ${RECORDER_SERVICE} paused, ${ANALYZER_SERVICE} restarted" sudo systemctl stop "$RECORDER_SERVICE" sudo systemctl restart "$ANALYZER_SERVICE" elif (( wav_count > 30 )); then - log_red "$(date) WARNING: Queue growing (>30). Restarting ${ANALYZER_SERVICE}" + log_red "$(date) WARNING: Queue >30. Restarting ${ANALYZER_SERVICE}" apprisealert "Queue >30: ${ANALYZER_SERVICE} restarted" sudo systemctl restart "$ANALYZER_SERVICE" else - # If the queue is normal, check both services. check_and_restart_service "$RECORDER_SERVICE" check_and_restart_service "$ANALYZER_SERVICE" + apprisealert_recovery fi } @@ -156,11 +176,8 @@ handle_queue() { ######################################## while true; do sleep 61 - - # Check disk space usage first. check_disk_space - # Check the content of the analyzing_now file to see if it has changed. current_file=$(cat "$ANALYZING_NOW_FILE" 2>/dev/null) if [[ "$current_file" == "$analyzing_now" ]]; then (( same_file_counter++ )) @@ -170,17 +187,13 @@ while true; do fi if (( same_file_counter >= SAME_FILE_THRESHOLD )); then - log_yellow "$(date) WARNING: 'analyzing_now' unchanged for ${SAME_FILE_THRESHOLD} iterations, restarting services" + log_yellow "$(date) WARNING: 'analyzing_now' unchanged for ${SAME_FILE_THRESHOLD} iterations" apprisealert "No change in analyzing_now for ${SAME_FILE_THRESHOLD} iterations" "$HOME/BirdNET-Pi/scripts/restart_services.sh" same_file_counter=0 fi - # Count the number of .wav files in the ingest directory. wav_count=$(find -L "$INGEST_DIR" -maxdepth 1 -name '*.wav' | wc -l) log_green "$(date) INFO: ${wav_count} wav files waiting in ${INGEST_DIR}" - - # Handle queue size conditions and service health. handle_queue "$wav_count" - done