From 70111926513226d1ee45d9c6b9dd5cc34ceec626 Mon Sep 17 00:00:00 2001
From: Alexandre <44178713+alexbelgium@users.noreply.github.com>
Date: Tue, 4 Feb 2025 20:28:55 +0100
Subject: [PATCH] Update 30-monitoring.sh

---
Review notes (free text in this position is ignored by `git am`):

* Purpose: the monitoring loop is split into four helper checks
  (check_disk_space, check_analyzing_now, check_queue, check_services).
  Each helper returns 0 when its check passes and 1 after it has logged
  and alerted; the loop ORs the results into any_issue.
* The RECORDER_SERVICE / ANALYZER_SERVICE variables are replaced by a
  single SERVICES array shared by apprisealert() and check_services().
* check_and_restart_service() is removed. NOTE(review): after the ">50"
  branch of check_queue() stops birdnet_recording, nothing visible in this
  patch starts it again other than restart_services.sh, whose contents are
  not part of this diff - confirm that this is intended.
* Amended in review: check_queue() logged "Queue is at a manageable level"
  before evaluating the thresholds, so a backlog above 30 or 50 files was
  first reported as manageable. That line is now emitted only when neither
  threshold trips. The number of added lines is unchanged, so the hunk
  header and diffstat below still hold; the index/commit hashes still
  describe the pre-amendment blob.
* The "<br>" separators in the notification strings are restored (apprise
  is called with --input-format=html). NOTE(review): indentation, blank
  lines and any other inline HTML tags (for example <b>) were not
  recoverable and are reconstructed on a best-effort basis - compare the
  context lines with the base file before applying.

 .../rootfs/custom-services.d/30-monitoring.sh | 208 ++++++++----------
 1 file changed, 97 insertions(+), 111 deletions(-)

diff --git a/birdnet-pi/rootfs/custom-services.d/30-monitoring.sh b/birdnet-pi/rootfs/custom-services.d/30-monitoring.sh
index ea9e00d93..87c1dbd91 100755
--- a/birdnet-pi/rootfs/custom-services.d/30-monitoring.sh
+++ b/birdnet-pi/rootfs/custom-services.d/30-monitoring.sh
@@ -1,11 +1,11 @@
 #!/usr/bin/env bash
 # shellcheck shell=bash
-# Improved BirdNET-Pi Monitoring Script with Recovery Alerts and Detailed Logs
+# Improved BirdNET-Pi Monitoring Script with Recovery Alerts and Condensed Logs
 
 HOME="/home/pi"
 
 ########################################
-# Logging Functions
+# Logging Functions (color-coded for terminal clarity)
 ########################################
 log_green() { echo -e "\033[32m$1\033[0m"; }
 log_red() { echo -e "\033[31m$1\033[0m"; }
@@ -42,9 +42,8 @@ mkdir -p "$INGEST_DIR" || { log_red "Failed to create directory: $INGEST_DIR"; e
 chown -R pi:pi "$INGEST_DIR" || log_yellow "Could not change ownership for $INGEST_DIR"
 chmod -R 755 "$INGEST_DIR" || log_yellow "Could not set permissions for $INGEST_DIR"
 
-# Service names
-RECORDER_SERVICE="birdnet_recording"
-ANALYZER_SERVICE="birdnet_analysis"
+# Services to monitor
+SERVICES=(birdnet_analysis chart_viewer spectrogram_viewer icecast2 birdnet_recording birdnet_log birdnet_stats)
 
 # Notification settings
 NOTIFICATION_INTERVAL=1800 # seconds (30 minutes)
@@ -64,33 +63,29 @@ else
 fi
 
 ########################################
-# Functions
+# Notification Functions
 ########################################
 
-# Send an issue notification
 apprisealert() {
     local issue_message="$1"
     local current_time
     current_time=$(date +%s)
     local time_diff=$(( current_time - last_notification_time ))
 
+    # Throttle notifications
     if (( time_diff < NOTIFICATION_INTERVAL )); then
-        log_yellow "Notification suppressed (last sent ${time_diff} seconds ago)"
+        log_yellow "Notification suppressed (last sent ${time_diff} seconds ago)."
         return
     fi
 
-    local notification=""
     local stopped_service="<br>Stopped services: "
-
-    # Check for stopped services
-    local services=(birdnet_analysis chart_viewer spectrogram_viewer icecast2 birdnet_recording birdnet_log birdnet_stats)
-    for service in "${services[@]}"; do
+    for service in "${SERVICES[@]}"; do
         if [[ "$(systemctl is-active "$service")" != "active" ]]; then
             stopped_service+="$service; "
         fi
     done
 
-    notification+="Issue: $issue_message"
+    local notification="Issue: $issue_message"
     notification+="$stopped_service"
     notification+="<br>System: ${SITE_NAME:-$(hostname)}"
     notification+="<br>Available disk space: $(df -h "$HOME/BirdSongs" | awk 'NR==2 {print $4}')"
@@ -98,15 +93,15 @@ apprisealert() {
     local TITLE="BirdNET-Analyzer Alert"
 
     if [[ -f "$HOME/BirdNET-Pi/birdnet/bin/apprise" && -s "$HOME/BirdNET-Pi/apprise.txt" ]]; then
-        "$HOME/BirdNET-Pi/birdnet/bin/apprise" -vv -t "$TITLE" -b "$notification" --input-format=html --config="$HOME/BirdNET-Pi/apprise.txt"
+        "$HOME/BirdNET-Pi/birdnet/bin/apprise" -vv -t "$TITLE" -b "$notification" \
+            --input-format=html --config="$HOME/BirdNET-Pi/apprise.txt"
         last_notification_time=$current_time
-        issue_reported=1 # Mark that an issue was reported
+        issue_reported=1
     else
         log_red "Apprise not configured or missing!"
     fi
 }
 
-# Send a "System is back to normal" notification
 apprisealert_recovery() {
     # Only send a recovery message if we had previously reported an issue
     if (( issue_reported == 1 )); then
@@ -118,136 +113,127 @@ apprisealert_recovery() {
         notification+="Available disk space: $(df -h "$HOME/BirdSongs" | awk 'NR==2 {print $4}')"
 
         if [[ -f "$HOME/BirdNET-Pi/birdnet/bin/apprise" && -s "$HOME/BirdNET-Pi/apprise.txt" ]]; then
-            "$HOME/BirdNET-Pi/birdnet/bin/apprise" -vv -t "$TITLE" -b "$notification" --input-format=html --config="$HOME/BirdNET-Pi/apprise.txt"
+            "$HOME/BirdNET-Pi/birdnet/bin/apprise" -vv -t "$TITLE" -b "$notification" \
+                --input-format=html --config="$HOME/BirdNET-Pi/apprise.txt"
         fi
-        issue_reported=0 # Reset issue tracker
+        issue_reported=0
     fi
 }
 
-# Restart a service if inactive
-check_and_restart_service() {
-    local service_name="$1"
-    local state
-    state=$(systemctl is-active "$service_name")
-
-    if [[ "$state" != "active" ]]; then
-        log_yellow "$(date) INFO: Restarting $service_name"
-        sudo systemctl restart "$service_name"
-        sleep 61
-        state=$(systemctl is-active "$service_name")
+########################################
+# Helper Checks
+########################################
 
-        if [[ "$state" != "active" ]]; then
-            log_red "$(date) WARNING: $service_name could not restart"
-            apprisealert "$service_name cannot restart! Your system seems stuck."
-        else
-            log_green "$(date) INFO: $service_name restarted successfully."
-        fi
-    else
-        log_green "$(date) INFO: $service_name is running normally."
-    fi
-}
-
-# Check disk usage
 check_disk_space() {
     local current_usage
     current_usage=$(df -h "$HOME/BirdSongs" | awk 'NR==2 {print $5}' | sed 's/%//')
-
+
     if (( current_usage >= DISK_USAGE_THRESHOLD )); then
-        log_red "$(date) WARNING: Disk usage is at ${current_usage}%"
+        log_red "$(date) INFO: Disk usage is at ${current_usage}% (CRITICAL!)"
         apprisealert "Disk usage critical: ${current_usage}%"
-        return 1 # Indicate there is an issue
+        return 1
     else
+        # Example: "Tue Feb 4 20:18:49 CET 2025 INFO: Disk usage is within acceptable limits (30%)."
         log_green "$(date) INFO: Disk usage is within acceptable limits (${current_usage}%)."
+        return 0
     fi
-    return 0 # No disk issue
 }
 
-# Handle queue size
-handle_queue() {
-    local wav_count="$1"
-
-    if (( wav_count > 50 )); then
-        log_red "$(date) WARNING: Queue >50. Pausing ${RECORDER_SERVICE} and restarting ${ANALYZER_SERVICE}"
-        apprisealert "Queue >50: ${RECORDER_SERVICE} paused, ${ANALYZER_SERVICE} restarted"
-        sudo systemctl stop "$RECORDER_SERVICE"
-        sudo systemctl restart "$ANALYZER_SERVICE"
-        return 1
-    elif (( wav_count > 30 )); then
-        log_red "$(date) WARNING: Queue >30. Restarting ${ANALYZER_SERVICE}"
-        apprisealert "Queue >30: ${ANALYZER_SERVICE} restarted"
-        sudo systemctl restart "$ANALYZER_SERVICE"
-        return 1
+check_analyzing_now() {
+    local current_file
+    current_file=$(cat "$ANALYZING_NOW_FILE" 2>/dev/null)
+    if [[ "$current_file" == "$analyzing_now" ]]; then
+        (( same_file_counter++ ))
     else
-        # Check if services are alive; attempt restarts if needed
-        check_and_restart_service "$RECORDER_SERVICE"
-        check_and_restart_service "$ANALYZER_SERVICE"
-        log_green "$(date) INFO: Queue is at a manageable level (${wav_count} wav files)."
+        same_file_counter=0
+        analyzing_now="$current_file"
     fi
+    if (( same_file_counter >= SAME_FILE_THRESHOLD )); then
+        log_red "$(date) INFO: 'analyzing_now' file unchanged for $SAME_FILE_THRESHOLD iterations."
+        apprisealert "No change in analyzing_now for ${SAME_FILE_THRESHOLD} iterations"
+        "$HOME/BirdNET-Pi/scripts/restart_services.sh"
+        same_file_counter=0
+        return 1
+    else
+        # Only log if it changed this iteration
+        if (( same_file_counter == 0 )); then
+            log_green "$(date) INFO: 'analyzing_now' file has been updated."
+        fi
+        return 0
+    fi
+}
+
+check_queue() {
+    local wav_count
+    wav_count=$(find -L "$INGEST_DIR" -maxdepth 1 -name '*.wav' | wc -l)
+
+    # Evaluate the backlog thresholds first so an overloaded queue is never logged as "manageable".
+    if (( wav_count > 50 )); then
+        log_red "$(date) INFO: Queue >50. Stopping recorder + restarting analyzer."
+        apprisealert "Queue exceeded 50: stopping recorder, restarting analyzer."
+        sudo systemctl stop birdnet_recording
+        sudo systemctl restart birdnet_analysis
+        return 1
+    elif (( wav_count > 30 )); then
+        log_red "$(date) INFO: Queue >30. Restarting analyzer."
+        apprisealert "Queue exceeded 30: restarting analyzer."
+        sudo systemctl restart birdnet_analysis
+        return 1
+    fi
+
+    # Example: "Tue Feb 4 20:18:50 CET 2025 INFO: Queue is at a manageable level (1 wav files)."
+    log_info "$(date) INFO: Queue is at a manageable level (${wav_count} wav files)."
     return 0
 }
 
+check_services() {
+    local inactive_services=()
+    for service in "${SERVICES[@]}"; do
+        if [[ "$(systemctl is-active "$service")" != "active" ]]; then
+            inactive_services+=("$service")
+        fi
+    done
+
+    if (( ${#inactive_services[@]} == 0 )); then
+        # Example: "Tue Feb 4 20:18:50 CET 2025 INFO: All services are active"
+        log_green "$(date) INFO: All services are active"
+        return 0
+    else
+        log_red "$(date) INFO: Some services are NOT active: ${inactive_services[*]}"
+        apprisealert "One or more services inactive: ${inactive_services[*]}"
+        return 1
+    fi
+}
+
 ########################################
 # Main Monitoring Loop
 ########################################
-iteration=1
+
 while true; do
     sleep 61
 
-    log_info "----------------------------------------"
-    log_info "$(date) INFO: Starting monitoring iteration $iteration"
+    log_info "$(date) INFO: Starting monitoring check"
 
-    # Track whether any issue is found this iteration
     any_issue=0
 
-    # 1) Check disk space
-    if ! check_disk_space; then
-        any_issue=1
-    fi
+    # 1) Disk usage
+    check_disk_space || any_issue=1
 
-    # 2) Check if analyzing_now file is stuck
-    current_file=$(cat "$ANALYZING_NOW_FILE" 2>/dev/null)
-    if [[ "$current_file" == "$analyzing_now" ]]; then
-        (( same_file_counter++ ))
-        log_red "$(date) WARNING: 'analyzing_now' file unchanged (${same_file_counter} consecutive iterations)"
-    else
-        same_file_counter=0
-        analyzing_now="$current_file"
-        log_green "$(date) INFO: 'analyzing_now' file has been updated."
-    fi
-
-    if (( same_file_counter >= SAME_FILE_THRESHOLD )); then
-        log_red "$(date) ERROR: 'analyzing_now' unchanged for ${SAME_FILE_THRESHOLD} iterations"
-        apprisealert "No change in analyzing_now for ${SAME_FILE_THRESHOLD} iterations"
-        "$HOME/BirdNET-Pi/scripts/restart_services.sh"
-        same_file_counter=0
-        any_issue=1
-    fi
+    # 2) 'analyzing_now' file
+    check_analyzing_now || any_issue=1
 
     # 3) Queue check
-    wav_count=$(find -L "$INGEST_DIR" -maxdepth 1 -name '*.wav' | wc -l)
-    log_info "$(date) INFO: ${wav_count} wav files waiting in ${INGEST_DIR}"
-    if ! handle_queue "$wav_count"; then
-        any_issue=1
-    fi
+    check_queue || any_issue=1
 
-    # 4) Check all essential services are running
-    services=(birdnet_analysis chart_viewer spectrogram_viewer birdnet_recording birdnet_log birdnet_stats)
-    for service in "${services[@]}"; do
-        if [[ "$(systemctl is-active "$service")" != "active" ]]; then
-            log_red "$(date) ERROR: Service $service is not active!"
-            any_issue=1
-        else
-            log_green "$(date) INFO: Service $service is active."
-        fi
-    done
+    # 4) Services check
+    check_services || any_issue=1
 
-    # Summary log for the iteration
+    # Final summary
     if (( any_issue == 0 )); then
-        log_green "$(date) INFO: All systems are functioning normally in iteration $iteration."
+        # Example: "Tue Feb 4 20:18:50 CET 2025 INFO: All systems are functioning normally"
+        log_green "$(date) INFO: All systems are functioning normally"
         apprisealert_recovery
     else
-        log_red "$(date) ERROR: Issues detected in iteration $iteration. System status remains degraded."
+        log_red "$(date) INFO: Issues detected. System status is not fully operational."
     fi
-
-    iteration=$((iteration+1))
 done