#!/bin/bash
#
# Remote management script for Prometheus Node Exporter.
#
# Installs or uninstalls node_exporter on one or many remote hosts over
# SSH (password authentication through sshpass, or key authentication),
# and keeps a docker-compose managed Prometheus configuration in sync by
# appending / removing three-line scrape jobs of the form:
#
#   - job_name: "<hostname>"
#     static_configs:
#       - targets: ["<ip>:<port>"]
#
# Usage: run the script and pick an operation from the interactive menu.
# NOTE: new jobs are appended to the END of the Prometheus config file, so
# `scrape_configs:` is expected to be the last top-level section.
set -euo pipefail

# ====================== Core configuration ======================
# Path of the locally extracted node_exporter directory
LOCAL_EXTRACT_DIR="/root/node_exporter-1.8.2.linux-amd64"
# Port node_exporter listens on
NODE_EXPORTER_PORT="9100"
# Host list file (one host per line: IP or IP:port, '#' starts a comment)
HOSTS_FILE="./node_exporter_hosts.txt"
# Remote login user (default for batch mode)
REMOTE_USER="root"
# Remote login password (default for batch mode; empty means key auth).
# NOTE(review): a credential is hard-coded as the default. It can be
# overridden (or emptied) from the environment: REMOTE_PASSWORD=... ./script
REMOTE_PASSWORD="${REMOTE_PASSWORD-hp93000}"
# Remote scratch directory
REMOTE_TMP_DIR="/tmp/node_exporter_install"
# SSH connect timeout (seconds)
SSH_CONNECT_TIMEOUT=20
# SSH server keep-alive interval (seconds)
SSH_SERVER_ALIVE_INTERVAL=15
# Upper bound for a single remote command (seconds)
REMOTE_CMD_TIMEOUT=180
# Maximum number of file transfer attempts
MAX_TRANSFER_RETRIES=3

# ====================== Prometheus configuration ======================
# Full path of the Prometheus configuration file
PROMETHEUS_CONFIG_FILE="/root/promethesu/conf/prometheus.yml"
# Full path of docker-compose.yml
DOCKER_COMPOSE_PATH="/root/promethesu/docker-compose.yml"
# Restart Prometheus automatically after a successful install/uninstall
AUTO_RESTART_PROMETHEUS=true
# Back up the Prometheus config before modifying it
AUTO_BACKUP_CONFIG=true
# Run a syntax check after modifying the Prometheus config
AUTO_CHECK_CONFIG=true
# =====================================================================

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Result statistics
SUCCESS_COUNT=0
FAILURE_COUNT=0
FAILURE_HOSTS=()
# Hosts loaded by read_hosts_file
REMOTE_HOSTS=()
# Set when the Prometheus config changed and a restart is required
NEED_RESTART_PROMETHEUS=false
# Path of the most recent config backup
LAST_BACKUP_FILE=""
# Set by check_dependencies / verify_local_directory; initialised here so
# that `set -u` never trips on code paths that skip those steps.
PROMTOOL_AVAILABLE=false
LOCAL_BINARY_MD5=""

# Options shared by every ssh/scp invocation: no host key prompts, quiet
# logging, bounded connect time and keep-alives. Kept as an array so each
# option stays a separate word without relying on word splitting.
SSH_COMMON_OPTS=(
  -o StrictHostKeyChecking=no
  -o UserKnownHostsFile=/dev/null
  -o LogLevel=ERROR
  -o "ConnectTimeout=${SSH_CONNECT_TIMEOUT}"
  -o "ServerAliveInterval=${SSH_SERVER_ALIVE_INTERVAL}"
  -o ServerAliveCountMax=3
  -o TCPKeepAlive=yes
)

# Colored log helpers. warn/error go to stderr so they never end up inside
# values captured with $(...).
info() { echo -e "${GREEN}[INFO]${NC} $1"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $1" >&2; }
error() { echo -e "${RED}[ERROR]${NC} $1" >&2; }
blue() { echo -e "${BLUE}[INFO]${NC} $1"; }

# Record a failed host in the global statistics.
# Arguments: $1 - host[:port] as given by the caller
_record_failure() {
  FAILURE_COUNT=$((FAILURE_COUNT + 1))
  FAILURE_HOSTS+=("$1")
}

# Write a sample host list to $HOSTS_FILE and exit the script.
generate_sample_hosts_file() {
  info "生成示例IP配置文件: $HOSTS_FILE"
  {
    echo "# Node Exporter批量安装主机列表"
    echo "# 格式:每行一个主机,支持以下格式:"
    echo "# 1. 仅IP地址(默认端口22)"
    echo "# 2. IP:端口(指定SSH端口)"
    echo "# 3. #开头的行是注释,会被忽略"
    echo "# 4. 空行会被忽略"
    echo ""
    echo "# 示例:"
    echo "# 192.168.1.10"
    echo "# 192.168.1.11:2222"
    echo "# 192.168.1.12"
    echo ""
    echo "# 请在下方添加您的主机:"
    echo "10.150.10.83"
    echo "10.150.10.86"
    echo "10.150.10.87"
  } > "$HOSTS_FILE"
  info "示例IP配置文件已生成"
  info "请编辑 $HOSTS_FILE 添加您的主机,然后重新运行脚本"
  exit 0
}

# Load $HOSTS_FILE into the global REMOTE_HOSTS array, skipping blank lines
# and comment lines. Exits when the file is missing or has no hosts.
read_hosts_file() {
  local generate_answer line

  info "读取IP配置文件: $HOSTS_FILE"
  if [[ ! -f "$HOSTS_FILE" ]]; then
    warn "IP配置文件不存在: $HOSTS_FILE"
    read -r -p "是否生成示例配置文件? (y/n): " generate_answer
    if [[ "$generate_answer" == "y" || "$generate_answer" == "Y" ]]; then
      generate_sample_hosts_file
    else
      error "请创建IP配置文件或修改脚本中的HOSTS_FILE配置项"
      exit 1
    fi
  fi

  REMOTE_HOSTS=()
  # `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Trim leading/trailing whitespace (also strips a Windows CR).
    line="${line#"${line%%[![:space:]]*}"}"
    line="${line%"${line##*[![:space:]]}"}"
    if [[ -z "$line" || "$line" == \#* ]]; then
      continue
    fi
    REMOTE_HOSTS+=("$line")
  done < "$HOSTS_FILE"

  if [[ ${#REMOTE_HOSTS[@]} -eq 0 ]]; then
    error "IP配置文件中没有有效的主机"
    info "请编辑 $HOSTS_FILE 添加您的主机"
    exit 1
  fi
  info "成功读取 ${#REMOTE_HOSTS[@]} 个主机"
}

# Verify the local tools this script relies on. Missing docker-compose,
# promtool or sshpass only produce warnings; anything else is fatal.
# Sets the global PROMTOOL_AVAILABLE.
check_dependencies() {
  local deps=("scp" "ssh" "tar" "timeout" "stat" "md5sum" "docker-compose")
  local dep continue_answer

  info "检查本地依赖..."
  for dep in "${deps[@]}"; do
    if ! command -v "$dep" &> /dev/null; then
      if [[ "$dep" == "docker-compose" ]]; then
        warn "未找到docker-compose命令,将无法自动重启Prometheus"
      else
        error "未找到命令: $dep"
        exit 1
      fi
    fi
  done

  # promtool is used for the configuration syntax check
  if command -v promtool &> /dev/null; then
    PROMTOOL_AVAILABLE=true
    info "检测到promtool,将自动检查Prometheus配置语法"
  else
    PROMTOOL_AVAILABLE=false
    warn "未找到promtool,将跳过配置语法检查"
    info "安装promtool: yum install -y prometheus2"
  fi

  # sshpass is only needed for password authentication
  if ! command -v sshpass &> /dev/null; then
    warn "未找到sshpass命令"
    info "如果您使用密码认证,请先安装sshpass:"
    info " CentOS/RHEL 7/8/9: yum install -y epel-release && yum install -y sshpass"
    info " CentOS/RHEL 6: yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-6.noarch.rpm && yum install -y sshpass"
    info ""
    info "如果您使用SSH密钥认证,则无需安装sshpass"
    read -r -p "是否继续执行? (y/n): " continue_answer
    if [[ "$continue_answer" != "y" && "$continue_answer" != "Y" ]]; then
      info "退出脚本"
      exit 0
    fi
  else
    info "sshpass已安装 (版本: $(sshpass -V 2>&1 | head -1 | awk '{print $2}'))"
  fi
  info "所有依赖检查通过"
}

# Check that the extracted node_exporter directory and binary exist and
# store the binary's MD5 in the global LOCAL_BINARY_MD5.
verify_local_directory() {
  info "验证本地已解压目录: $LOCAL_EXTRACT_DIR"
  if [[ ! -d "$LOCAL_EXTRACT_DIR" ]]; then
    error "本地目录不存在: $LOCAL_EXTRACT_DIR"
    info "请将解压好的node_exporter目录放在上述路径,或修改脚本中的LOCAL_EXTRACT_DIR配置项"
    exit 1
  fi
  if [[ ! -f "$LOCAL_EXTRACT_DIR/node_exporter" ]]; then
    error "本地目录中缺少node_exporter二进制文件"
    info "请确保您的目录结构正确,包含node_exporter可执行文件"
    exit 1
  fi
  # MD5 of the binary, used later to verify the remote copy
  LOCAL_BINARY_MD5=$(md5sum "$LOCAL_EXTRACT_DIR/node_exporter" | awk '{print $1}')
  info "本地二进制文件MD5: $LOCAL_BINARY_MD5"
  info "本地目录验证通过"
}

# Check that the Prometheus config and docker-compose file exist, that the
# config has a top-level scrape_configs section and is writable, and make
# sure the config ends with a newline (required before appending).
verify_prometheus_config() {
  info "验证Prometheus配置文件..."
  if [[ ! -f "$PROMETHEUS_CONFIG_FILE" ]]; then
    error "Prometheus配置文件不存在: $PROMETHEUS_CONFIG_FILE"
    info "请修改脚本中的PROMETHEUS_CONFIG_FILE配置项为正确路径"
    exit 1
  fi
  if [[ ! -f "$DOCKER_COMPOSE_PATH" ]]; then
    error "docker-compose文件不存在: $DOCKER_COMPOSE_PATH"
    info "请修改脚本中的DOCKER_COMPOSE_PATH配置项为正确路径"
    exit 1
  fi
  if ! grep -q "^scrape_configs:" "$PROMETHEUS_CONFIG_FILE"; then
    error "Prometheus配置文件中未找到scrape_configs部分"
    exit 1
  fi
  if [[ ! -w "$PROMETHEUS_CONFIG_FILE" ]]; then
    error "Prometheus配置文件不可写: $PROMETHEUS_CONFIG_FILE"
    info "请检查文件权限"
    exit 1
  fi
  # $(...) strips a trailing newline, so non-empty output means the last
  # byte of the file is not a newline.
  if [[ -n "$(tail -c1 "$PROMETHEUS_CONFIG_FILE")" ]]; then
    warn "检测到配置文件末尾没有换行符,将自动修复"
    echo "" >> "$PROMETHEUS_CONFIG_FILE"
    info "已添加换行符到配置文件末尾"
  fi
  info "Prometheus配置文件验证通过"
}

# Copy the Prometheus config to a timestamped backup and remember its path
# in LAST_BACKUP_FILE. No-op when AUTO_BACKUP_CONFIG is not true.
backup_prometheus_config() {
  if [[ "$AUTO_BACKUP_CONFIG" != true ]]; then
    return 0
  fi
  LAST_BACKUP_FILE="${PROMETHEUS_CONFIG_FILE}.bak.$(date +%Y%m%d_%H%M%S)"
  info "备份Prometheus配置文件到: $LAST_BACKUP_FILE"
  if cp "$PROMETHEUS_CONFIG_FILE" "$LAST_BACKUP_FILE"; then
    info "配置文件备份成功"
  else
    warn "配置文件备份失败,继续执行..."
    LAST_BACKUP_FILE=""
  fi
}

# Restore the Prometheus config from LAST_BACKUP_FILE.
# Returns: 0 on success, 1 when no backup exists or the copy fails.
rollback_prometheus_config() {
  if [[ -z "$LAST_BACKUP_FILE" || ! -f "$LAST_BACKUP_FILE" ]]; then
    error "没有可用的备份文件,无法回滚"
    return 1
  fi
  warn "检测到配置错误,正在回滚到备份文件: $LAST_BACKUP_FILE"
  if cp "$LAST_BACKUP_FILE" "$PROMETHEUS_CONFIG_FILE"; then
    info "配置文件已回滚"
    return 0
  fi
  error "回滚失败"
  return 1
}

# Run `promtool check config` when enabled and available.
# Returns: 0 when the check passes or is skipped, 1 on a syntax error.
check_prometheus_config_syntax() {
  if [[ "$AUTO_CHECK_CONFIG" != true || "$PROMTOOL_AVAILABLE" != true ]]; then
    return 0
  fi
  info "检查Prometheus配置语法..."
  if promtool check config "$PROMETHEUS_CONFIG_FILE" &> /dev/null; then
    info "Prometheus配置语法正确"
    return 0
  fi
  error "Prometheus配置语法错误"
  # Run again without redirection so the operator sees the details.
  promtool check config "$PROMETHEUS_CONFIG_FILE" || true
  return 1
}

# Append one line to the Prometheus config and confirm it is now the last
# line of the file.
# Arguments: $1 - exact line to append
_append_config_line() {
  local line="$1"
  echo "$line" >> "$PROMETHEUS_CONFIG_FILE" || return 1
  [[ "$(tail -n 1 "$PROMETHEUS_CONFIG_FILE")" == "$line" ]]
}

# Drop every line after line $1 from the Prometheus config (undo appends).
_truncate_config_to() {
  local keep="$1"
  sed -i "$((keep + 1)),\$d" "$PROMETHEUS_CONFIG_FILE"
}

# Delete the three-line job block starting at line $1, but only when the
# lines really look like job_name / static_configs / targets, so unrelated
# configuration is never removed.
_delete_job_block() {
  local start="$1"
  local end=$((start + 2))
  local first second third

  if ((start < 1)); then
    return 1
  fi
  first=$(sed -n "${start}p" "$PROMETHEUS_CONFIG_FILE")
  second=$(sed -n "$((start + 1))p" "$PROMETHEUS_CONFIG_FILE")
  third=$(sed -n "${end}p" "$PROMETHEUS_CONFIG_FILE")
  if [[ "$first" != *"job_name:"* || "$second" != *"static_configs:"* ||
    "$third" != *"targets:"* ]]; then
    return 1
  fi
  sed -i "${start},${end}d" "$PROMETHEUS_CONFIG_FILE"
}

# Append a scrape job for a node to the end of the Prometheus config.
# Arguments:
#   $1 - job name (normally the remote short hostname)
#   $2 - target IP / host
#   $3 - target port (optional, defaults to NODE_EXPORTER_PORT)
# Returns: 0 when the job was added or already exists, 1 on failure (the
#          appended lines are removed again / the backup is restored).
update_prometheus_config() {
  local hostname="$1"
  local ip="$2"
  local port="${3:-$NODE_EXPORTER_PORT}"
  local target="${ip}:${port}"
  local job_line="job_name: \"$hostname\""
  local target_line="targets: [\"$target\"]"
  local original_lines

  info "更新Prometheus配置文件(纯echo追加方式)..."

  # Fixed-string matches: host names contain '.', which is a regex wildcard.
  if grep -qF -- "$job_line" "$PROMETHEUS_CONFIG_FILE"; then
    warn "Prometheus配置中已存在job_name: $hostname,跳过添加"
    return 0
  fi
  if grep -qF -- "$target_line" "$PROMETHEUS_CONFIG_FILE"; then
    warn "Prometheus配置中已存在target: $target,跳过添加"
    return 0
  fi

  backup_prometheus_config

  # The file must end with a newline before anything is appended.
  if [[ -n "$(tail -c1 "$PROMETHEUS_CONFIG_FILE")" ]]; then
    warn "检测到配置文件末尾没有换行符,自动添加"
    echo "" >> "$PROMETHEUS_CONFIG_FILE"
  fi
  # Remember the current length so a partial write can be undone exactly.
  original_lines=$(wc -l < "$PROMETHEUS_CONFIG_FILE")

  info "开始逐行添加配置..."
  # Line 1: job_name (2-space indent)
  if _append_config_line "  - ${job_line}"; then
    info " ✅ 第1行写入成功: job_name: \"$hostname\""
  else
    error " ❌ 第1行写入失败"
    _truncate_config_to "$original_lines" || true
    return 1
  fi
  # Line 2: static_configs (4-space indent)
  if _append_config_line "    static_configs:"; then
    info " ✅ 第2行写入成功: static_configs:"
  else
    error " ❌ 第2行写入失败"
    _truncate_config_to "$original_lines" || true
    return 1
  fi
  # Line 3: targets (6-space indent)
  if _append_config_line "      - ${target_line}"; then
    info " ✅ 第3行写入成功: targets: [\"$target\"]"
  else
    error " ❌ 第3行写入失败"
    _truncate_config_to "$original_lines" || true
    return 1
  fi

  info "✅ 所有配置行写入成功!"
  info "已添加的完整配置块:"
  info "  - job_name: \"$hostname\""
  info "    static_configs:"
  info "      - targets: [\"$target\"]"

  if check_prometheus_config_syntax; then
    NEED_RESTART_PROMETHEUS=true
    return 0
  fi
  # Syntax error: restore the backup, or at least drop the appended lines
  # when no backup is available.
  rollback_prometheus_config || _truncate_config_to "$original_lines" || true
  return 1
}

# Remove a node's three-line scrape job from the Prometheus config.
# Arguments: $1 - either the job name (hostname) or the target "ip:port"
# Returns: 0 when a job was removed and the config is still valid,
#          1 when nothing matched or the change had to be rolled back.
remove_from_prometheus_config() {
  local identifier="$1"
  local job_line="job_name: \"$identifier\""
  local target_line="targets: [\"$identifier\"]"
  local match_line=""

  info "从Prometheus配置中移除监控节点: $identifier"

  # Try by job_name: the matching line is the first line of the block.
  if grep -qF -- "$job_line" "$PROMETHEUS_CONFIG_FILE"; then
    info "找到job_name: $identifier,正在移除..."
    match_line=$(grep -nF -m1 -- "$job_line" "$PROMETHEUS_CONFIG_FILE" |
      cut -d: -f1) || match_line=""
    backup_prometheus_config
    if [[ -n "$match_line" ]] && _delete_job_block "$match_line" &&
      ! grep -qF -- "$job_line" "$PROMETHEUS_CONFIG_FILE"; then
      info "✅ 已从Prometheus配置中移除job: $identifier"
      if check_prometheus_config_syntax; then
        NEED_RESTART_PROMETHEUS=true
        return 0
      fi
      rollback_prometheus_config || true
      return 1
    fi
  fi

  # Try by target: the matching line is the LAST line of the block, so the
  # block starts two lines above it.
  if grep -qF -- "$target_line" "$PROMETHEUS_CONFIG_FILE"; then
    info "找到target: $identifier,正在移除..."
    match_line=$(grep -nF -m1 -- "$target_line" "$PROMETHEUS_CONFIG_FILE" |
      cut -d: -f1) || match_line=""
    backup_prometheus_config
    if [[ -n "$match_line" ]] && _delete_job_block "$((match_line - 2))" &&
      ! grep -qF -- "$target_line" "$PROMETHEUS_CONFIG_FILE"; then
      info "✅ 已从Prometheus配置中移除target: $identifier"
      if check_prometheus_config_syntax; then
        NEED_RESTART_PROMETHEUS=true
        return 0
      fi
      rollback_prometheus_config || true
      return 1
    fi
  fi

  warn "在Prometheus配置中未找到节点: $identifier"
  return 1
}

# Restart the Prometheus container through docker-compose when the config
# changed and automatic restarts are enabled.
restart_prometheus() {
  local compose_dir

  if [[ "$AUTO_RESTART_PROMETHEUS" != true || "$NEED_RESTART_PROMETHEUS" != true ]]; then
    return 0
  fi
  info "重启Prometheus服务使配置生效..."
  compose_dir=$(dirname "$DOCKER_COMPOSE_PATH")
  if ! cd "$compose_dir"; then
    error "Prometheus重启失败,请手动执行以下命令检查:"
    error " cd $compose_dir"
    return 1
  fi
  # -f pins the compose file even when it is not named docker-compose.yml.
  if docker-compose -f "$DOCKER_COMPOSE_PATH" restart prometheus; then
    info "✅ Prometheus重启成功,新配置已生效"
    # Give Prometheus time to come up before checking its state
    sleep 10
    if docker-compose -f "$DOCKER_COMPOSE_PATH" ps | grep -q "prometheus.*Up"; then
      info "Prometheus服务运行正常"
    else
      warn "Prometheus服务可能未正常启动,请手动检查"
    fi
  else
    error "Prometheus重启失败,请手动执行以下命令检查:"
    error " cd $compose_dir"
    error " docker-compose logs -f prometheus"
  fi
  return 0
}

# Run a command on a remote host, bounded by REMOTE_CMD_TIMEOUT.
# Arguments: $1 host, $2 ssh port, $3 user, $4 password (empty = key auth),
#            $5 command string
# Outputs:   the remote command's stdout; diagnostics go to stderr
# Returns:   the remote command's exit status, 124 on timeout, 255 on an
#            SSH-level failure.
remote_exec() {
  local host="$1"
  local port="$2"
  local user="$3"
  local password="$4"
  local cmd="$5"
  local exit_code=0

  # --foreground lets ssh keep reading the terminal/stdin when this script
  # is run interactively; `|| exit_code=$?` keeps `set -e` from aborting
  # before the status can be inspected.
  if [[ -n "$password" ]]; then
    # The password travels via the environment, not argv.
    SSHPASS="$password" timeout --foreground "$REMOTE_CMD_TIMEOUT" \
      sshpass -e ssh "${SSH_COMMON_OPTS[@]}" -p "$port" \
      "${user}@${host}" "$cmd" || exit_code=$?
  else
    timeout --foreground "$REMOTE_CMD_TIMEOUT" \
      ssh "${SSH_COMMON_OPTS[@]}" -p "$port" \
      "${user}@${host}" "$cmd" || exit_code=$?
  fi

  if [[ $exit_code -ne 0 ]]; then
    if [[ $exit_code -eq 124 ]]; then
      error "远程命令执行超时(超过${REMOTE_CMD_TIMEOUT}秒)"
    elif [[ $exit_code -eq 255 ]]; then
      error "SSH连接失败:网络问题或远程主机不可达"
    else
      # Anything else is the remote command's own exit status
      error "远程命令执行失败,退出代码: $exit_code"
    fi
  fi
  return $exit_code
}

# Copy the contents of a local directory to a remote directory and verify
# the node_exporter binary's MD5, retrying up to MAX_TRANSFER_RETRIES times.
# Arguments: $1 host, $2 port, $3 user, $4 password, $5 local source dir,
#            $6 remote destination dir (must exist), $7 expected MD5
# Returns:   0 on a verified transfer, 1 otherwise.
remote_copy_dir_with_verify() {
  local host="$1"
  local port="$2"
  local user="$3"
  local password="$4"
  local src_dir="$5"
  local dest_dir="$6"
  local expected_md5="$7"
  local retry_count=0
  local scp_status remote_md5

  while [[ $retry_count -lt $MAX_TRANSFER_RETRIES ]]; do
    if [[ $retry_count -gt 0 ]]; then
      warn "目录传输重试 $retry_count/$MAX_TRANSFER_RETRIES..."
      # Wipe the possibly incomplete copy, then recreate the directory:
      # scp cannot copy several files into a directory that does not exist.
      remote_exec "$host" "$port" "$user" "$password" \
        "rm -rf '$dest_dir' && mkdir -p '$dest_dir'" &> /dev/null || true
      sleep 2
    fi

    info "正在传输目录: $src_dir -> $dest_dir"
    scp_status=0
    if [[ -n "$password" ]]; then
      SSHPASS="$password" sshpass -e scp -r "${SSH_COMMON_OPTS[@]}" \
        -P "$port" "$src_dir"/* "${user}@${host}:${dest_dir}/" || scp_status=$?
    else
      scp -r "${SSH_COMMON_OPTS[@]}" \
        -P "$port" "$src_dir"/* "${user}@${host}:${dest_dir}/" || scp_status=$?
    fi

    if [[ $scp_status -eq 0 ]]; then
      info "验证远程文件完整性..."
      remote_md5=$(remote_exec "$host" "$port" "$user" "$password" \
        "md5sum '$dest_dir/node_exporter' 2>/dev/null | cut -d' ' -f1 || echo 'ERROR_MD5_FAILED'") ||
        remote_md5="ERROR_MD5_FAILED"
      if [[ "$remote_md5" == "$expected_md5" ]]; then
        info "✅ 目录传输成功,MD5校验通过: $remote_md5"
        return 0
      fi
      error "MD5校验不匹配!本地: $expected_md5,远程: $remote_md5"
    else
      error "目录传输失败"
    fi
    retry_count=$((retry_count + 1))
  done

  error "目录传输失败,已重试 $MAX_TRANSFER_RETRIES 次"
  return 1
}

# Step-by-step SSH troubleshooting: ping, port probe, SSH banner,
# authentication and free space in the remote /tmp. Purely informational;
# every step reports its own result.
# Arguments: $1 host, $2 port, $3 user, $4 password (empty = key auth)
diagnose_ssh_connection() {
  local host="$1"
  local port="$2"
  local user="$3"
  local password="$4"
  local auth_status=0
  local disk_space=""
  local -a diag_opts=(
    -o StrictHostKeyChecking=no
    -o LogLevel=ERROR
    -o ConnectTimeout=10
  )

  info "开始SSH连接诊断..."

  info "1. 测试网络连通性..."
  if ping -c 5 -W 2 "$host" &> /dev/null; then
    info " ✓ 网络连通性正常"
  else
    error " ✗ 网络连通性失败,无法ping通主机"
  fi

  info "2. 测试SSH端口 $port 是否开放..."
  if command -v nc &> /dev/null; then
    if nc -z -w 10 "$host" "$port"; then
      info " ✓ SSH端口 $port 开放"
    else
      error " ✗ SSH端口 $port 关闭或被防火墙阻止"
    fi
  else
    warn " 未找到nc命令,跳过端口测试"
  fi

  info "3. 测试SSH服务响应..."
  if command -v telnet &> /dev/null; then
    # Host and port are passed as positional arguments so they are never
    # re-parsed as shell code by the inner bash.
    if timeout 10 bash -c 'echo "" | telnet "$1" "$2" 2>&1 | grep -i "ssh"' \
      _ "$host" "$port" &> /dev/null; then
      info " ✓ SSH服务正在运行"
    else
      error " ✗ SSH服务未响应"
    fi
  else
    warn " 未找到telnet命令,跳过SSH服务响应测试"
  fi

  info "4. 测试身份验证..."
  if [[ -n "$password" ]]; then
    info " 使用密码认证方式"
    SSHPASS="$password" sshpass -e ssh "${diag_opts[@]}" -p "$port" \
      "${user}@${host}" "echo '认证成功'" 2>&1 || auth_status=$?
  else
    info " 使用密钥认证方式"
    ssh "${diag_opts[@]}" -p "$port" \
      "${user}@${host}" "echo '认证成功'" 2>&1 || auth_status=$?
  fi
  if [[ $auth_status -eq 0 ]]; then
    info " ✓ 身份验证成功"
  else
    error " ✗ 身份验证失败"
    info ""
    info "常见解决方案:"
    info " 1. 检查用户名和密码是否正确"
    info " 2. 确认远程主机允许密码认证(PasswordAuthentication yes)"
    info " 3. 确认远程主机允许root登录(PermitRootLogin yes)"
    info " 4. 检查远程主机的防火墙和SELinux设置"
    info " 5. 查看远程主机的SSH日志:tail -f /var/log/secure"
  fi

  info "5. 测试远程磁盘空间..."
  # df pads its columns with runs of spaces; squeeze them so that cut's
  # field 4 really is the "available" column.
  disk_space=$(remote_exec "$host" "$port" "$user" "$password" \
    "df -P /tmp | tail -1 | tr -s ' ' | cut -d' ' -f4") || disk_space=""
  if [[ "$disk_space" =~ ^[0-9]+$ ]] && ((disk_space > 10240)); then
    info " ✓ 远程/tmp目录可用空间: $((disk_space / 1024)) MB"
  else
    error " ✗ 远程/tmp目录空间不足或无法访问"
  fi

  info "诊断完成"
}

# Extract the major release number from a Red Hat family release string
# (the content of /etc/redhat-release or /etc/issue).
# Arguments: $1 - release text
# Outputs:   the major version (e.g. "7") on stdout
# Returns:   0 when a CentOS/RHEL release was recognised, 1 otherwise.
_detect_rhel_major() {
  local release_text="$1"

  if ! grep -qiE 'centos|red.*hat|rhel' <<< "$release_text"; then
    return 1
  fi
  # Prefer the number right after "release": "CentOS Linux release 7.6.1810"
  # must yield 7, not match on the minor version.
  if [[ "$release_text" =~ [Rr]elease[[:space:]]+([0-9]+) ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
    return 0
  fi
  if [[ "$release_text" =~ ([0-9]+) ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
    return 0
  fi
  return 1
}

# Install node_exporter on one host and register it in Prometheus.
# Arguments: $1 - host or host:ssh_port, $2 - user, $3 - password
# Returns:   1 when the host could not be reached or prepared, 0 otherwise
#            (an installation failure is recorded in the statistics).
install_on_host() {
  local host_port="$1"
  local user="$2"
  local password="$3"
  local host="${host_port%:*}"
  local port="22"
  local diagnose_answer os_release os_major install_script remote_hostname
  local system_type="unknown"
  local service_manager="unknown"

  if [[ "$host_port" == *:* ]]; then
    port="${host_port#*:}"
  fi

  info "============================================="
  info "开始处理主机: ${host}:${port}"

  info "测试SSH连接..."
  if ! remote_exec "$host" "$port" "$user" "$password" "echo 'SSH连接成功'" &> /dev/null; then
    error "无法连接到主机 ${host}:${port}"
    read -r -p "是否进行SSH连接诊断? (y/n): " diagnose_answer
    if [[ "$diagnose_answer" == "y" || "$diagnose_answer" == "Y" ]]; then
      diagnose_ssh_connection "$host" "$port" "$user" "$password" || true
    fi
    _record_failure "$host_port"
    return 1
  fi
  info "✅ SSH连接成功"

  info "检测系统版本..."
  os_release=$(remote_exec "$host" "$port" "$user" "$password" \
    "cat /etc/redhat-release 2>/dev/null || cat /etc/issue 2>/dev/null | head -1") || {
    error "无法获取系统版本信息"
    _record_failure "$host_port"
    return 1
  }
  info "检测到系统: $os_release"

  # Map the major release to a system type and service manager.
  os_major=$(_detect_rhel_major "$os_release") || os_major=""
  case "$os_major" in
    5 | 6)
      system_type="rhel${os_major}"
      service_manager="sysvinit"
      ;;
    7 | 8 | 9)
      system_type="rhel${os_major}"
      service_manager="systemd"
      ;;
    *)
      warn "不支持的系统类型,尝试使用通用方式安装"
      # Probe for systemd; a non-zero status here is an expected answer.
      if remote_exec "$host" "$port" "$user" "$password" \
        "command -v systemctl &> /dev/null" &> /dev/null; then
        service_manager="systemd"
      else
        service_manager="sysvinit"
      fi
      ;;
  esac
  info "系统类型: $system_type, 服务管理器: $service_manager"

  info "清理并创建远程临时目录..."
  if ! remote_exec "$host" "$port" "$user" "$password" \
    "rm -rf $REMOTE_TMP_DIR && mkdir -p $REMOTE_TMP_DIR"; then
    error "传输安装文件失败"
    _record_failure "$host_port"
    return 1
  fi

  info "传输安装文件到远程主机..."
  if ! remote_copy_dir_with_verify "$host" "$port" "$user" "$password" \
    "$LOCAL_EXTRACT_DIR" "$REMOTE_TMP_DIR" "$LOCAL_BINARY_MD5"; then
    error "传输安装文件失败"
    _record_failure "$host_port"
    return 1
  fi

  info "开始远程安装..."
  # The outer here-document is unquoted on purpose: $REMOTE_TMP_DIR,
  # $LOCAL_BINARY_MD5, $system_type, $service_manager and
  # ${NODE_EXPORTER_PORT} are expanded locally, while \$... is left for the
  # remote shell.
  # NOTE(review): the generated SysV init script writes `$!` to its PID file
  # after `su -c "... &"`, which does not capture the daemon's PID; left
  # unchanged here — confirm on a SysV host before relying on stop/status.
  install_script=$(cat << EOF
#!/bin/bash
set -euo pipefail
cd "$REMOTE_TMP_DIR"

# 验证文件完整性
echo "验证远程文件完整性..."
if ! md5sum "node_exporter" | cut -d' ' -f1 | grep -q "$LOCAL_BINARY_MD5"; then
    echo "ERROR: 远程文件MD5校验失败!"
    echo "本地MD5: $LOCAL_BINARY_MD5"
    echo "远程MD5: \$(md5sum "node_exporter" | cut -d' ' -f1)"
    exit 1
fi
echo "文件完整性验证通过"

# 创建node_exporter用户(如果不存在)
if ! id node_exporter &> /dev/null; then
    useradd -M -s /sbin/nologin node_exporter 2>/dev/null || adduser -M -s /sbin/nologin node_exporter
fi

# 安装二进制文件
cp node_exporter /usr/local/bin/
chmod +x /usr/local/bin/node_exporter
chown node_exporter:node_exporter /usr/local/bin/node_exporter

# 创建数据目录
mkdir -p /var/lib/node_exporter
chown node_exporter:node_exporter /var/lib/node_exporter

# RHEL9专属优化:配置SELinux允许node_exporter运行
if [ "$system_type" = "rhel9" ]; then
    echo "配置RHEL9 SELinux规则..."
    # 允许node_exporter绑定到任何端口
    semanage port -a -t http_port_t -p tcp ${NODE_EXPORTER_PORT} 2>/dev/null || true
    # 允许node_exporter读取系统信息
    setsebool -P domain_can_mmap_files 1 2>/dev/null || true
fi

# 安装服务文件
if [ "$service_manager" = "systemd" ]; then
    # Systemd服务文件
    cat > /etc/systemd/system/node_exporter.service << 'SERVICE_EOF'
[Unit]
Description=Prometheus Node Exporter
After=network.target

[Service]
User=node_exporter
Group=node_exporter
Type=simple
ExecStart=/usr/local/bin/node_exporter \
    --collector.systemd \
    --collector.processes \
    --collector.filesystem.ignored-mount-points="^/(sys|proc|dev|host|etc|run|var/lib/docker)($|/)" \
    --collector.cpu.info \
    --collector.meminfo \
    --collector.loadavg \
    --collector.diskstats \
    --collector.netdev \
    --web.listen-address=:${NODE_EXPORTER_PORT}
Restart=always
RestartSec=5
Delegate=yes
ProtectSystem=strict
ProtectHome=yes
PrivateTmp=yes
ProtectKernelTunables=no
ProtectControlGroups=no

[Install]
WantedBy=multi-user.target
SERVICE_EOF
    systemctl daemon-reload
    systemctl enable node_exporter
    systemctl start node_exporter
else
    # SysVinit服务文件
    cat > /etc/init.d/node_exporter << 'SERVICE_EOF'
#!/bin/bash
# chkconfig: 2345 90 10
# description: Prometheus Node Exporter
NAME="node_exporter"
DAEMON="/usr/local/bin/\${NAME}"
PIDFILE="/var/run/\${NAME}.pid"
USER="node_exporter"
OPTIONS="--collector.systemd --collector.processes --collector.filesystem.ignored-mount-points=\"^/(sys|proc|dev|host|etc)(\\\$|/)\" --web.listen-address=:${NODE_EXPORTER_PORT}"

start() {
    if [ -f "\$PIDFILE" ]; then
        echo "\$NAME is already running"
        exit 1
    fi
    echo "Starting \$NAME..."
    su -s /bin/sh \$USER -c "\$DAEMON \$OPTIONS &"
    echo \$! > "\$PIDFILE"
    echo "\$NAME started"
}

stop() {
    if [ ! -f "\$PIDFILE" ]; then
        echo "\$NAME is not running"
        exit 1
    fi
    echo "Stopping \$NAME..."
    kill \$(cat "\$PIDFILE")
    rm -f "\$PIDFILE"
    echo "\$NAME stopped"
}

status() {
    if [ -f "\$PIDFILE" ]; then
        echo "\$NAME is running (PID: \$(cat "\$PIDFILE"))"
    else
        echo "\$NAME is not running"
    fi
}

restart() {
    stop
    sleep 2
    start
}

case "\$1" in
    start) start ;;
    stop) stop ;;
    status) status ;;
    restart) restart ;;
    *) echo "Usage: \$0 {start|stop|status|restart}"; exit 1 ;;
esac
SERVICE_EOF
    chmod +x /etc/init.d/node_exporter
    chkconfig --add node_exporter 2>/dev/null || true
    chkconfig node_exporter on 2>/dev/null || true
    service node_exporter start
fi

# 配置防火墙
if [ "$system_type" = "rhel5" ] || [ "$system_type" = "rhel6" ]; then
    if command -v iptables &> /dev/null; then
        iptables -I INPUT -p tcp --dport ${NODE_EXPORTER_PORT} -j ACCEPT
        if [ -f /etc/sysconfig/iptables ]; then
            service iptables save 2>/dev/null || true
        fi
    fi
elif [ "$system_type" = "rhel7" ] || [ "$system_type" = "rhel8" ] || [ "$system_type" = "rhel9" ]; then
    if command -v firewall-cmd &> /dev/null; then
        firewall-cmd --permanent --add-port=${NODE_EXPORTER_PORT}/tcp 2>/dev/null || true
        firewall-cmd --reload 2>/dev/null || true
    fi
fi

# 验证安装
echo "等待服务启动..."
sleep 5
for i in 1 2 3; do
    echo "验证尝试 \$i/3..."
    if command -v curl &> /dev/null; then
        if curl -s --connect-timeout 5 http://localhost:${NODE_EXPORTER_PORT}/metrics &> /dev/null; then
            echo "SUCCESS: Node Exporter安装成功并正在运行"
            exit 0
        fi
    elif command -v wget &> /dev/null; then
        if wget -q -T 5 -O /dev/null http://localhost:${NODE_EXPORTER_PORT}/metrics; then
            echo "SUCCESS: Node Exporter安装成功并正在运行"
            exit 0
        fi
    else
        if ps aux | grep -v grep | grep node_exporter &> /dev/null; then
            echo "SUCCESS: Node Exporter进程已启动"
            exit 0
        fi
    fi
    sleep 3
done

echo "ERROR: Node Exporter安装失败"
systemctl status node_exporter 2>/dev/null || service node_exporter status 2>/dev/null
exit 1
EOF
  )

  # Stream the script straight to the remote host; no local temp file with
  # a predictable name is needed.
  if ! remote_exec "$host" "$port" "$user" "$password" \
    "cat > '$REMOTE_TMP_DIR/install_remote.sh'" <<< "$install_script"; then
    error "传输安装脚本失败"
    _record_failure "$host_port"
    return 1
  fi

  info "执行远程安装脚本..."
  if remote_exec "$host" "$port" "$user" "$password" \
    "chmod +x $REMOTE_TMP_DIR/install_remote.sh && $REMOTE_TMP_DIR/install_remote.sh"; then
    info "✅ 主机 ${host}:${port} 安装成功"
    SUCCESS_COUNT=$((SUCCESS_COUNT + 1))

    info "获取远程主机名..."
    remote_hostname=$(remote_exec "$host" "$port" "$user" "$password" \
      "hostname -s 2>/dev/null || hostname") || remote_hostname=""
    if [[ -z "$remote_hostname" || "$remote_hostname" == "ERROR_MD5_FAILED" ]]; then
      warn "无法获取远程主机名,使用IP作为job_name"
      remote_hostname="$host"
    fi
    info "远程主机名: $remote_hostname"

    # A config failure must not abort the remaining hosts.
    update_prometheus_config "$remote_hostname" "$host" ||
      warn "Prometheus配置更新失败,请手动检查: ${host}:${NODE_EXPORTER_PORT}"
  else
    error "主机 ${host}:${port} 安装失败"
    _record_failure "$host_port"
  fi

  info "清理远程临时文件..."
  remote_exec "$host" "$port" "$user" "$password" "rm -rf $REMOTE_TMP_DIR" &> /dev/null || true
  return 0
}

# Uninstall node_exporter from one host and remove it from Prometheus.
# Arguments: $1 - host or host:ssh_port, $2 - user, $3 - password
# Returns:   1 when the host could not be reached or the script could not
#            be transferred, 0 otherwise.
uninstall_on_host() {
  local host_port="$1"
  local user="$2"
  local password="$3"
  local host="${host_port%:*}"
  local port="22"
  local remote_hostname uninstall_script

  if [[ "$host_port" == *:* ]]; then
    port="${host_port#*:}"
  fi

  info "============================================="
  info "开始卸载主机: ${host}:${port}"

  info "测试SSH连接..."
  if ! remote_exec "$host" "$port" "$user" "$password" "echo 'SSH连接成功'" &> /dev/null; then
    error "无法连接到主机 ${host}:${port}"
    _record_failure "$host_port"
    return 1
  fi
  info "✅ SSH连接成功"

  # The hostname is the job_name used when the node was registered.
  info "获取远程主机名..."
  remote_hostname=$(remote_exec "$host" "$port" "$user" "$password" \
    "hostname -s 2>/dev/null || hostname") || remote_hostname=""
  if [[ -z "$remote_hostname" || "$remote_hostname" == "ERROR_MD5_FAILED" ]]; then
    warn "无法获取远程主机名,将使用IP:PORT从Prometheus配置中移除"
    remote_hostname="${host}:${NODE_EXPORTER_PORT}"
  fi
  info "远程主机名: $remote_hostname"

  info "开始远程卸载..."
  uninstall_script=$(cat << EOF
#!/bin/bash
set -euo pipefail

echo "停止node_exporter服务..."
if command -v systemctl &> /dev/null; then
    systemctl stop node_exporter 2>/dev/null || true
    systemctl disable node_exporter 2>/dev/null || true
    rm -f /etc/systemd/system/node_exporter.service
    systemctl daemon-reload
else
    service node_exporter stop 2>/dev/null || true
    chkconfig node_exporter off 2>/dev/null || true
    chkconfig --del node_exporter 2>/dev/null || true
    rm -f /etc/init.d/node_exporter
fi

echo "删除node_exporter二进制文件..."
rm -f /usr/local/bin/node_exporter

echo "删除node_exporter用户和数据目录..."
userdel node_exporter 2>/dev/null || true
rm -rf /var/lib/node_exporter

echo "删除防火墙规则..."
if command -v firewall-cmd &> /dev/null; then
    firewall-cmd --permanent --remove-port=${NODE_EXPORTER_PORT}/tcp 2>/dev/null || true
    firewall-cmd --reload 2>/dev/null || true
elif command -v iptables &> /dev/null; then
    iptables -D INPUT -p tcp --dport ${NODE_EXPORTER_PORT} -j ACCEPT 2>/dev/null || true
    if [ -f /etc/sysconfig/iptables ]; then
        service iptables save 2>/dev/null || true
    fi
fi

echo "SUCCESS: Node Exporter卸载成功"
exit 0
EOF
  )

  if ! remote_exec "$host" "$port" "$user" "$password" \
    "cat > '/tmp/uninstall_remote.sh'" <<< "$uninstall_script"; then
    error "传输卸载脚本失败"
    _record_failure "$host_port"
    return 1
  fi

  info "执行远程卸载脚本..."
  if remote_exec "$host" "$port" "$user" "$password" \
    "chmod +x /tmp/uninstall_remote.sh && /tmp/uninstall_remote.sh"; then
    info "✅ 主机 ${host}:${port} 卸载成功"
    SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
    # Try the job name first, then fall back to the ip:port target.
    if ! remove_from_prometheus_config "$remote_hostname"; then
      remove_from_prometheus_config "${host}:${NODE_EXPORTER_PORT}" || true
    fi
  else
    error "主机 ${host}:${port} 卸载失败"
    _record_failure "$host_port"
  fi

  remote_exec "$host" "$port" "$user" "$password" "rm -f /tmp/uninstall_remote.sh" &> /dev/null || true
  return 0
}

# Install on every host listed in $HOSTS_FILE using the default credentials.
batch_install() {
  local host host_port confirm

  read_hosts_file
  info "进入批量安装模式"
  info "将安装到以下 ${#REMOTE_HOSTS[@]} 个主机:"
  for host in "${REMOTE_HOSTS[@]}"; do
    blue " - $host"
  done
  read -r -p "确认开始批量安装? (y/n): " confirm
  if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then
    info "取消批量安装"
    return 0
  fi
  for host_port in "${REMOTE_HOSTS[@]}"; do
    # One failing host must not stop the rest of the batch.
    install_on_host "$host_port" "$REMOTE_USER" "$REMOTE_PASSWORD" || true
  done
}

# Uninstall from every host listed in $HOSTS_FILE.
batch_uninstall() {
  local host host_port confirm

  read_hosts_file
  info "进入批量卸载模式"
  info "将从以下 ${#REMOTE_HOSTS[@]} 个主机卸载:"
  for host in "${REMOTE_HOSTS[@]}"; do
    blue " - $host"
  done
  read -r -p "确认开始批量卸载? (y/n): " confirm
  if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then
    info "取消批量卸载"
    return 0
  fi
  for host_port in "${REMOTE_HOSTS[@]}"; do
    uninstall_on_host "$host_port" "$REMOTE_USER" "$REMOTE_PASSWORD" || true
  done
}

# Prompt for a single host's connection details and install on it.
single_install() {
  local target_host target_port target_user target_password confirm

  info "进入指定单主机安装模式"
  read -r -p "请输入目标主机IP地址: " target_host
  read -r -p "请输入SSH端口号(默认22): " target_port
  target_port=${target_port:-22}
  read -r -p "请输入登录用户名(默认root): " target_user
  target_user=${target_user:-root}
  read -r -s -p "请输入登录密码(留空使用密钥认证): " target_password
  echo ""

  info "您输入的信息:"
  blue " 主机: ${target_host}:${target_port}"
  blue " 用户: ${target_user}"
  if [[ -z "$target_password" ]]; then
    blue " 认证方式: SSH密钥认证"
  else
    blue " 认证方式: 密码认证"
  fi
  read -r -p "确认信息正确并开始安装? (y/n): " confirm
  if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then
    info "取消安装"
    return 0
  fi
  install_on_host "${target_host}:${target_port}" "$target_user" "$target_password" || true
}

# Prompt for a single host's connection details and uninstall from it.
single_uninstall() {
  local target_host target_port target_user target_password confirm

  info "进入指定单主机卸载模式"
  read -r -p "请输入目标主机IP地址: " target_host
  read -r -p "请输入SSH端口号(默认22): " target_port
  target_port=${target_port:-22}
  read -r -p "请输入登录用户名(默认root): " target_user
  target_user=${target_user:-root}
  read -r -s -p "请输入登录密码(留空使用密钥认证): " target_password
  echo ""

  info "您输入的信息:"
  blue " 主机: ${target_host}:${target_port}"
  blue " 用户: ${target_user}"
  if [[ -z "$target_password" ]]; then
    blue " 认证方式: SSH密钥认证"
  else
    blue " 认证方式: 密码认证"
  fi
  read -r -p "确认信息正确并开始卸载? (y/n): " confirm
  if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then
    info "取消卸载"
    return 0
  fi
  uninstall_on_host "${target_host}:${target_port}" "$target_user" "$target_password" || true
}

# Prompt for a job name and target and add them to the Prometheus config.
# The port typed by the user is honoured; NODE_EXPORTER_PORT is only used
# when the target has no ":port" part.
manual_add_prometheus() {
  local job_name target target_ip target_port

  info "进入手动添加Prometheus监控节点模式"
  read -r -p "请输入job_name: " job_name
  read -r -p "请输入target (格式: IP:端口): " target
  if [[ -z "$job_name" || -z "$target" ]]; then
    error "job_name和target不能为空"
    return 1
  fi

  target_ip="${target%:*}"
  target_port="$NODE_EXPORTER_PORT"
  if [[ "$target" == *:* && -n "${target##*:}" ]]; then
    target_port="${target##*:}"
  fi

  if ! update_prometheus_config "$job_name" "$target_ip" "$target_port"; then
    error "手动添加失败"
    return 1
  fi
  restart_prometheus || true
  info "✅ 手动添加完成"
}

# Print the success/failure summary for the hosts processed in this run.
show_results() {
  local host

  info "============================================="
  info "操作结果汇总:"
  info "总主机数: $((SUCCESS_COUNT + FAILURE_COUNT))"
  info "成功: ${SUCCESS_COUNT}"
  info "失败: ${FAILURE_COUNT}"
  if [[ ${FAILURE_COUNT} -gt 0 ]]; then
    error "失败的主机:"
    for host in "${FAILURE_HOSTS[@]}"; do
      error " - $host"
    done
  else
    info "✅ 所有主机操作成功!"
  fi
  if [[ "$NEED_RESTART_PROMETHEUS" == true ]]; then
    info ""
    info "Prometheus配置已更新"
  fi
}

# Entry point: validate the Prometheus config, show the menu and dispatch.
main() {
  local operation_mode diag_host diag_port diag_user diag_password

  info "Node Exporter远程管理脚本(纯echo追加版)"
  info "使用本地已解压目录: $LOCAL_EXTRACT_DIR"
  info "IP配置文件: $HOSTS_FILE"
  info "Prometheus配置添加方式: 纯echo逐行追加"
  info "============================================="

  verify_prometheus_config

  echo ""
  blue "请选择操作:"
  echo " 1. 批量安装(从IP配置文件读取主机列表)"
  echo " 2. 指定安装(手动输入单台主机信息)"
  echo " 3. 批量卸载(从IP配置文件读取主机列表)"
  echo " 4. 指定卸载(手动输入单台主机信息)"
  echo " 5. 手动添加Prometheus监控节点"
  echo " 6. 生成示例IP配置文件"
  echo " 7. SSH连接诊断工具"
  echo " 8. 重新验证本地目录和Prometheus配置"
  echo " 9. 退出"
  echo ""
  read -r -p "请输入选项(1-9): " operation_mode

  case "$operation_mode" in
    1)
      verify_local_directory
      batch_install
      ;;
    2)
      verify_local_directory
      single_install
      ;;
    3)
      batch_uninstall
      ;;
    4)
      single_uninstall
      ;;
    5)
      manual_add_prometheus
      exit 0
      ;;
    6)
      generate_sample_hosts_file
      ;;
    7)
      info "进入SSH连接诊断工具"
      read -r -p "请输入目标主机IP地址: " diag_host
      read -r -p "请输入SSH端口号(默认22): " diag_port
      diag_port=${diag_port:-22}
      read -r -p "请输入登录用户名(默认root): " diag_user
      diag_user=${diag_user:-root}
      read -r -s -p "请输入登录密码(留空使用密钥认证): " diag_password
      echo ""
      diagnose_ssh_connection "$diag_host" "$diag_port" "$diag_user" "$diag_password"
      exit 0
      ;;
    8)
      info "重新验证本地目录和Prometheus配置..."
      verify_local_directory
      verify_prometheus_config
      info "✅ 所有验证完成,请重新运行脚本"
      exit 0
      ;;
    9)
      info "退出脚本"
      exit 0
      ;;
    *)
      error "无效的选项"
      exit 1
      ;;
  esac

  show_results
  restart_prometheus || true

  # Remove leftovers written by earlier versions of this script.
  info "清理本地临时文件..."
  rm -f "/tmp/install_remote.sh" "/tmp/uninstall_remote.sh"
  info "✅ 脚本执行完成"
}

# Run
check_dependencies
main "$@"