Files
node_exporter_install/install_node_exporter.sh
T
2026-05-26 10:25:47 +08:00

1297 lines
50 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
set -euo pipefail
# ====================== Core configuration ======================
# Path of the locally extracted node_exporter directory
LOCAL_EXTRACT_DIR="/root/node_exporter-1.8.2.linux-amd64"
# Port Node Exporter listens on
NODE_EXPORTER_PORT="9100"
# Host list file (one host per line, "IP" or "IP:port", "#" starts a comment)
HOSTS_FILE="./node_exporter_hosts.txt"
# Remote login user (default for batch mode)
REMOTE_USER="root"
# Remote login password (default for batch mode; leave empty for key auth)
# NOTE(review): a plaintext credential is committed here - consider loading it
# from the environment or a protected file instead.
REMOTE_PASSWORD="hp93000"
# Temporary directory used on the remote host
REMOTE_TMP_DIR="/tmp/node_exporter_install"
# SSH connect timeout (seconds)
SSH_CONNECT_TIMEOUT=20
# SSH server keep-alive interval (seconds)
SSH_SERVER_ALIVE_INTERVAL=15
# Timeout for a remote command (seconds)
REMOTE_CMD_TIMEOUT=180
# Maximum number of file transfer attempts
MAX_TRANSFER_RETRIES=3
# ====================== Prometheus configuration ======================
# Full path of the Prometheus configuration file
PROMETHEUS_CONFIG_FILE="/root/promethesu/conf/prometheus.yml"
# Full path of the docker-compose.yml file
DOCKER_COMPOSE_PATH="/root/promethesu/docker-compose.yml"
# Restart Prometheus automatically after a successful install/uninstall
AUTO_RESTART_PROMETHEUS=true
# Back up the Prometheus configuration before modifying it
AUTO_BACKUP_CONFIG=true
# Check the Prometheus configuration syntax after modifying it
AUTO_CHECK_CONFIG=true
# =====================================================================
# Color definitions
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Result counters
SUCCESS_COUNT=0
FAILURE_COUNT=0
FAILURE_HOSTS=()
# Set to true once the Prometheus configuration has been changed
NEED_RESTART_PROMETHEUS=false
# Path of the most recent configuration backup
LAST_BACKUP_FILE=""
# Colored log helpers. Each prints "<color>[LEVEL]<reset> <message>" on stdout;
# backslash escapes in the whole line (the color codes included) are expanded.
# Arguments: $1 - message text
info() { printf '%b\n' "${GREEN}[INFO]${NC} $1"; }
warn() { printf '%b\n' "${YELLOW}[WARN]${NC} $1"; }
error() { printf '%b\n' "${RED}[ERROR]${NC} $1"; }
blue() { printf '%b\n' "${BLUE}[INFO]${NC} $1"; }
# Write a commented sample host list to $HOSTS_FILE, then exit the script.
# Globals:   HOSTS_FILE (read)
# Outputs:   creates or overwrites $HOSTS_FILE; progress messages via info
# Returns:   does not return; exits 0 on success, 1 if the file cannot be written
generate_sample_hosts_file() {
  info "生成示例IP配置文件: $HOSTS_FILE"
  # A single quoted here-document replaces the chain of appends, so the file is
  # opened once and a write failure is detected in one place.
  if ! cat > "$HOSTS_FILE" << 'SAMPLE_EOF'
# Node Exporter批量安装主机列表
# 格式:每行一个主机,支持以下格式:
# 1. 仅IP地址(默认端口22)
# 2. IP:端口(指定SSH端口)
# 3. #开头的行是注释,会被忽略
# 4. 空行会被忽略

# 示例:
# 192.168.1.10
# 192.168.1.11:2222
# 192.168.1.12

# 请在下方添加您的主机:
10.150.10.83
10.150.10.86
10.150.10.87
SAMPLE_EOF
  then
    error "无法写入IP配置文件: $HOSTS_FILE"
    exit 1
  fi
  info "示例IP配置文件已生成"
  info "请编辑 $HOSTS_FILE 添加您的主机,然后重新运行脚本"
  exit 0
}
# Load the host list from $HOSTS_FILE into the REMOTE_HOSTS array.
# Blank lines and lines whose first non-blank character is "#" are skipped;
# surrounding whitespace and a trailing carriage return are removed.
# Globals:   HOSTS_FILE (read), REMOTE_HOSTS (written)
# Returns:   0 on success; exits 1 when the file is missing (and the user
#            declines to generate a sample) or contains no hosts
read_hosts_file() {
  local line generate_answer
  info "读取IP配置文件: $HOSTS_FILE"
  if [[ ! -f "$HOSTS_FILE" ]]; then
    warn "IP配置文件不存在: $HOSTS_FILE"
    read -r -p "是否生成示例配置文件? (y/n): " generate_answer
    if [[ "$generate_answer" == [yY] ]]; then
      generate_sample_hosts_file
    else
      error "请创建IP配置文件或修改脚本中的HOSTS_FILE配置项"
      exit 1
    fi
  fi
  REMOTE_HOSTS=()
  # "|| [[ -n $line ]]" keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Drop a Windows line ending, then trim leading/trailing whitespace with
    # parameter expansion (no external process, no quote interpretation).
    line="${line%$'\r'}"
    line="${line#"${line%%[![:space:]]*}"}"
    line="${line%"${line##*[![:space:]]}"}"
    # Filter after trimming so indented comments are recognised too.
    if [[ -z "$line" || "$line" == '#'* ]]; then
      continue
    fi
    REMOTE_HOSTS+=("$line")
  done < "$HOSTS_FILE"
  if [[ ${#REMOTE_HOSTS[@]} -eq 0 ]]; then
    error "IP配置文件中没有有效的主机"
    info "请编辑 $HOSTS_FILE 添加您的主机"
    exit 1
  fi
  info "成功读取 ${#REMOTE_HOSTS[@]} 个主机"
}
# Verify that the local tools the script relies on are installed.
# Missing docker-compose only produces a warning; any other missing required
# tool aborts. promtool and sshpass are optional.
# Globals:   PROMTOOL_AVAILABLE (written: true/false)
# Returns:   0 when checks pass; exits 1 on a missing required tool, exits 0
#            when the user declines to continue without sshpass
check_dependencies() {
  local dep continue_answer sshpass_version
  local -a deps=("scp" "ssh" "tar" "timeout" "stat" "md5sum" "docker-compose")
  info "检查本地依赖..."
  for dep in "${deps[@]}"; do
    if command -v "$dep" &> /dev/null; then
      continue
    fi
    if [[ "$dep" == "docker-compose" ]]; then
      warn "未找到docker-compose命令,将无法自动重启Prometheus"
    else
      error "未找到命令: $dep"
      exit 1
    fi
  done
  # promtool is used for the configuration syntax check
  if command -v promtool &> /dev/null; then
    PROMTOOL_AVAILABLE=true
    info "检测到promtool,将自动检查Prometheus配置语法"
  else
    PROMTOOL_AVAILABLE=false
    warn "未找到promtool,将跳过配置语法检查"
    info "安装promtool: yum install -y prometheus2"
  fi
  # sshpass is only needed for password authentication
  if ! command -v sshpass &> /dev/null; then
    warn "未找到sshpass命令"
    info "如果您使用密码认证,请先安装sshpass:"
    info " CentOS/RHEL 7/8/9: yum install -y epel-release && yum install -y sshpass"
    info " CentOS/RHEL 6: yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-6.noarch.rpm && yum install -y sshpass"
    info ""
    info "如果您使用SSH密钥认证,则无需安装sshpass"
    read -r -p "是否继续执行? (y/n): " continue_answer
    if [[ "$continue_answer" != [yY] ]]; then
      info "退出脚本"
      exit 0
    fi
  else
    # awk consumes the whole stream, so the pipeline cannot fail with SIGPIPE
    # under pipefail; a failure to read the version is not fatal.
    sshpass_version=$(sshpass -V 2>&1 | awk 'NR == 1 { print $2 }') || true
    info "sshpass已安装 (版本: ${sshpass_version})"
  fi
  info "所有依赖检查通过"
}
# Check that the local node_exporter directory and binary exist, and record
# the binary's MD5 for later comparison with the remote copy.
# Globals:   LOCAL_EXTRACT_DIR (read), LOCAL_BINARY_MD5 (written)
# Returns:   0 on success; exits 1 if the directory or binary is missing or
#            the checksum cannot be computed
verify_local_directory() {
  local binary_path="$LOCAL_EXTRACT_DIR/node_exporter"
  local md5_output
  info "验证本地已解压目录: $LOCAL_EXTRACT_DIR"
  if [[ ! -d "$LOCAL_EXTRACT_DIR" ]]; then
    error "本地目录不存在: $LOCAL_EXTRACT_DIR"
    info "请将解压好的node_exporter目录放在上述路径,或修改脚本中的LOCAL_EXTRACT_DIR配置项"
    exit 1
  fi
  if [[ ! -f "$binary_path" ]]; then
    error "本地目录中缺少node_exporter二进制文件"
    info "请确保您的目录结构正确,包含node_exporter可执行文件"
    exit 1
  fi
  # Read via stdin so md5sum never escapes or echoes the file name; the digest
  # is everything before the first space.
  if ! md5_output=$(md5sum < "$binary_path"); then
    error "无法计算本地二进制文件MD5"
    exit 1
  fi
  LOCAL_BINARY_MD5="${md5_output%% *}"
  info "本地二进制文件MD5: $LOCAL_BINARY_MD5"
  info "本地目录验证通过"
}
# Sanity-check the Prometheus setup before anything is modified: both files
# must exist, the config must contain a top-level scrape_configs key and be
# writable. A missing final newline in the config is repaired in place.
# Globals:   PROMETHEUS_CONFIG_FILE, DOCKER_COMPOSE_PATH (read)
# Returns:   0 on success; exits 1 on the first failed check
verify_prometheus_config() {
  local config_path="$PROMETHEUS_CONFIG_FILE"
  local compose_path="$DOCKER_COMPOSE_PATH"
  local trailing_byte

  info "验证Prometheus配置文件..."

  if ! [[ -f "$config_path" ]]; then
    error "Prometheus配置文件不存在: $config_path"
    info "请修改脚本中的PROMETHEUS_CONFIG_FILE配置项为正确路径"
    exit 1
  fi

  if ! [[ -f "$compose_path" ]]; then
    error "docker-compose文件不存在: $compose_path"
    info "请修改脚本中的DOCKER_COMPOSE_PATH配置项为正确路径"
    exit 1
  fi

  # The append logic relies on a top-level scrape_configs section
  grep -q "^scrape_configs:" "$config_path" || {
    error "Prometheus配置文件中未找到scrape_configs部分"
    exit 1
  }

  if ! [[ -w "$config_path" ]]; then
    error "Prometheus配置文件不可写: $config_path"
    info "请检查文件权限"
    exit 1
  fi

  # Command substitution strips a newline, so a non-empty result means the
  # last byte of the file is something else.
  trailing_byte=$(tail -c1 "$config_path") || true
  if [[ -n "$trailing_byte" ]]; then
    warn "检测到配置文件末尾没有换行符,将自动修复"
    echo "" >> "$config_path"
    info "已添加换行符到配置文件末尾"
  fi

  info "Prometheus配置文件验证通过"
}
# Copy the Prometheus configuration to a timestamped ".bak.<date>_<time>" file.
# Globals:   AUTO_BACKUP_CONFIG, PROMETHEUS_CONFIG_FILE (read);
#            LAST_BACKUP_FILE (written: backup path, or "" if the copy failed)
# Returns:   0 in all cases; a failed copy is only reported as a warning
backup_prometheus_config() {
  local stamp

  # Backups can be switched off; LAST_BACKUP_FILE is then left untouched.
  [[ "$AUTO_BACKUP_CONFIG" == true ]] || return 0

  stamp=$(date +%Y%m%d_%H%M%S)
  LAST_BACKUP_FILE="${PROMETHEUS_CONFIG_FILE}.bak.${stamp}"
  info "备份Prometheus配置文件到: $LAST_BACKUP_FILE"

  if ! cp "$PROMETHEUS_CONFIG_FILE" "$LAST_BACKUP_FILE"; then
    warn "配置文件备份失败,继续执行..."
    LAST_BACKUP_FILE=""
    return 0
  fi
  info "配置文件备份成功"
}
# Restore the Prometheus configuration from the most recent backup.
# Globals:   LAST_BACKUP_FILE, PROMETHEUS_CONFIG_FILE (read)
# Returns:   0 when the backup was copied back; 1 when no backup is available
#            or the copy failed
rollback_prometheus_config() {
  if [[ -n "$LAST_BACKUP_FILE" && -f "$LAST_BACKUP_FILE" ]]; then
    warn "检测到配置错误,正在回滚到备份文件: $LAST_BACKUP_FILE"
    if ! cp "$LAST_BACKUP_FILE" "$PROMETHEUS_CONFIG_FILE"; then
      error "回滚失败"
      return 1
    fi
    info "配置文件已回滚"
    return 0
  fi

  error "没有可用的备份文件,无法回滚"
  return 1
}
# Validate the Prometheus configuration with promtool.
# The check is skipped (treated as success) when it is disabled or promtool
# is not available.
# Globals:   AUTO_CHECK_CONFIG, PROMTOOL_AVAILABLE, PROMETHEUS_CONFIG_FILE (read)
# Outputs:   on failure, promtool's combined output is printed to stdout
# Returns:   0 when valid or skipped, 1 when promtool rejects the file
check_prometheus_config_syntax() {
  local promtool_output
  # PROMTOOL_AVAILABLE is set by check_dependencies; default to false so the
  # function stays usable under "set -u" if that has not run.
  if [[ "$AUTO_CHECK_CONFIG" != true || "${PROMTOOL_AVAILABLE:-false}" != true ]]; then
    return 0
  fi
  info "检查Prometheus配置语法..."
  # Run promtool once and keep its output for the failure report.
  if promtool_output=$(promtool check config "$PROMETHEUS_CONFIG_FILE" 2>&1); then
    info "Prometheus配置语法正确"
    return 0
  fi
  error "Prometheus配置语法错误"
  printf '%s\n' "$promtool_output"
  return 1
}
# ==============================================
# Append a scrape job for one host to the Prometheus configuration.
# ==============================================
# The three-line job block is appended to the END of the file, so this only
# yields valid YAML when scrape_configs is the last top-level section.
# Globals:   PROMETHEUS_CONFIG_FILE, NODE_EXPORTER_PORT (read);
#            NEED_RESTART_PROMETHEUS (set to true on success)
# Arguments: $1 - job name (normally the remote short hostname)
#            $2 - target IP; the port is always NODE_EXPORTER_PORT
# Returns:   0 when the job was added or already present; 1 on invalid
#            arguments, write failure or a failed syntax check (the backup is
#            restored when one exists)
update_prometheus_config() {
  local hostname="$1"
  local ip="$2"
  local target="${ip}:${NODE_EXPORTER_PORT}"
  # Indentation: list item at 2 spaces, its keys at 4, the target list at 6.
  local job_line="  - job_name: \"${hostname}\""
  local static_line="    static_configs:"
  local targets_line="      - targets: [\"${target}\"]"
  local expected_block written_block

  info "更新Prometheus配置文件(纯echo追加方式)..."
  if [[ -z "$hostname" || -z "$ip" ]]; then
    error "job_name和目标地址不能为空"
    return 1
  fi
  # Fixed-string matching: dots in IPs/hostnames must not act as wildcards.
  if grep -qF -- "job_name: \"${hostname}\"" "$PROMETHEUS_CONFIG_FILE"; then
    warn "Prometheus配置中已存在job_name: $hostname,跳过添加"
    return 0
  fi
  if grep -qF -- "targets: [\"${target}\"]" "$PROMETHEUS_CONFIG_FILE"; then
    warn "Prometheus配置中已存在target: $target,跳过添加"
    return 0
  fi

  backup_prometheus_config

  # The block must start on its own line.
  if [[ -n "$(tail -c1 "$PROMETHEUS_CONFIG_FILE")" ]]; then
    warn "检测到配置文件末尾没有换行符,自动添加"
    echo "" >> "$PROMETHEUS_CONFIG_FILE"
  fi

  info "开始逐行添加配置..."
  # Append the whole block with one write so it can never be left half-written
  # by a failure between lines.
  if ! printf '%s\n' "$job_line" "$static_line" "$targets_line" >> "$PROMETHEUS_CONFIG_FILE"; then
    error "❌ 配置块写入失败"
    rollback_prometheus_config || true
    return 1
  fi
  # Read the block back and compare it byte-for-byte.
  expected_block=$(printf '%s\n' "$job_line" "$static_line" "$targets_line")
  written_block=$(tail -n 3 -- "$PROMETHEUS_CONFIG_FILE") || written_block=""
  if [[ "$written_block" != "$expected_block" ]]; then
    error "❌ 配置块写入失败"
    rollback_prometheus_config || true
    return 1
  fi

  info "✅ 所有配置行写入成功!"
  info "已添加的完整配置块:"
  info "$job_line"
  info "$static_line"
  info "$targets_line"

  if check_prometheus_config_syntax; then
    NEED_RESTART_PROMETHEUS=true
    return 0
  fi
  # Syntax error: restore the pre-change file. A failed rollback is already
  # reported by the callee and must not abort the caller under "set -e".
  rollback_prometheus_config || true
  return 1
}
# Print the 1-based number of the first line of file $2 containing the fixed
# string $1 (prints nothing when there is no such line).
_prom_first_line_containing() {
  # The needle travels through the environment so awk never interprets
  # backslashes or regex metacharacters in it.
  NEEDLE="$1" awk 'index($0, ENVIRON["NEEDLE"]) { print NR; exit }' "$2"
}

# Delete the three-line job block (job_name / static_configs / targets) that
# starts at line $1 of the Prometheus config. Refuses to delete anything when
# those three lines do not have that shape.
_prom_delete_job_block() {
  local start="$1"
  local end=$((start + 2))
  local shape
  shape=$(awk -v s="$start" -v e="$end" '
    NR == s     && /job_name:/       { ok++ }
    NR == s + 1 && /static_configs:/ { ok++ }
    NR == e     && /targets:/        { ok++ }
    END { print ok + 0 }' "$PROMETHEUS_CONFIG_FILE") || shape=0
  if [[ "$shape" != 3 ]]; then
    warn "配置块格式不符合预期,跳过自动移除"
    return 1
  fi
  sed -i "${start},${end}d" "$PROMETHEUS_CONFIG_FILE"
}

# Remove a monitored node from the Prometheus configuration.
# The identifier is tried as a job name first and as an "ip:port" target
# second. Only blocks in the three-line form written by
# update_prometheus_config are removed.
# Globals:   PROMETHEUS_CONFIG_FILE (read); NEED_RESTART_PROMETHEUS (set to
#            true on success)
# Arguments: $1 - job name or "ip:port" target
# Returns:   0 when a block was removed and the config is still valid;
#            1 when nothing was removed or the syntax check failed
remove_from_prometheus_config() {
  local identifier="$1"
  local job_needle="job_name: \"${identifier}\""
  local target_needle="targets: [\"${identifier}\"]"
  local line_no
  local removed=false

  info "从Prometheus配置中移除监控节点: $identifier"
  backup_prometheus_config

  # Attempt 1: identifier is a job name; the block starts on the matching line.
  line_no=$(_prom_first_line_containing "$job_needle" "$PROMETHEUS_CONFIG_FILE") || line_no=""
  if [[ -n "$line_no" ]]; then
    info "找到job_name: $identifier,正在移除..."
    if _prom_delete_job_block "$line_no" \
      && ! grep -qF -- "$job_needle" "$PROMETHEUS_CONFIG_FILE"; then
      info "✅ 已从Prometheus配置中移除job: $identifier"
      removed=true
    fi
  fi

  # Attempt 2: identifier is a target; its job block starts two lines above
  # the matching targets line.
  if [[ "$removed" != true ]]; then
    line_no=$(_prom_first_line_containing "$target_needle" "$PROMETHEUS_CONFIG_FILE") || line_no=""
    if [[ -n "$line_no" ]] && (( line_no > 2 )); then
      info "找到target: $identifier,正在移除..."
      if _prom_delete_job_block "$((line_no - 2))" \
        && ! grep -qF -- "$target_needle" "$PROMETHEUS_CONFIG_FILE"; then
        info "✅ 已从Prometheus配置中移除target: $identifier"
        removed=true
      fi
    fi
  fi

  if [[ "$removed" != true ]]; then
    warn "在Prometheus配置中未找到节点: $identifier"
    return 1
  fi

  if check_prometheus_config_syntax; then
    NEED_RESTART_PROMETHEUS=true
    return 0
  fi
  rollback_prometheus_config || true
  return 1
}
# Restart the "prometheus" docker-compose service so a changed configuration
# takes effect. Does nothing unless both AUTO_RESTART_PROMETHEUS and
# NEED_RESTART_PROMETHEUS are true.
# Globals:   AUTO_RESTART_PROMETHEUS, NEED_RESTART_PROMETHEUS,
#            DOCKER_COMPOSE_PATH (read)
# Returns:   0 in all cases; failures are reported through error/warn only
restart_prometheus() {
  local compose_dir ps_output
  if [[ "$AUTO_RESTART_PROMETHEUS" != true || "$NEED_RESTART_PROMETHEUS" != true ]]; then
    return 0
  fi
  info "重启Prometheus服务使配置生效..."
  compose_dir=$(dirname -- "$DOCKER_COMPOSE_PATH")
  # docker-compose runs in a subshell so the working directory of the script
  # (and with it relative paths such as HOSTS_FILE) is left untouched; a failed
  # cd is reported like a failed restart.
  if ! (cd "$compose_dir" && docker-compose restart prometheus); then
    error "Prometheus重启失败,请手动执行以下命令检查:"
    error " cd $compose_dir"
    error " docker-compose logs -f prometheus"
    return 0
  fi
  info "✅ Prometheus重启成功,新配置已生效"
  # Give the container time to come up before checking its state
  sleep 10
  # Capture the listing first: "cmd | grep -q" can fail under pipefail when
  # grep exits early.
  ps_output=$(cd "$compose_dir" && docker-compose ps) || ps_output=""
  if grep -q "prometheus.*Up" <<< "$ps_output"; then
    info "Prometheus服务运行正常"
  else
    warn "Prometheus服务可能未正常启动,请手动检查"
  fi
}
# Run a command on a remote host over SSH, bounded by REMOTE_CMD_TIMEOUT.
# stdin of the caller is passed through to the remote command.
# Globals:   SSH_CONNECT_TIMEOUT, SSH_SERVER_ALIVE_INTERVAL,
#            REMOTE_CMD_TIMEOUT (read)
# Arguments: $1 - host, $2 - SSH port, $3 - user,
#            $4 - password (empty selects key authentication),
#            $5 - command line to run remotely
# Outputs:   remote stdout on stdout; diagnostics on stderr so callers that
#            capture the output never receive error text
# Returns:   exit status of the remote command; 124 on timeout, 255 on SSH
#            connection failure
remote_exec() {
  local host="$1"
  local port="$2"
  local user="$3"
  local password="$4"
  local cmd="$5"
  local exit_code=0
  # Options as an array: each one stays a single word. Host key checking is
  # disabled and warnings are silenced.
  local -a ssh_opts=(
    -o StrictHostKeyChecking=no
    -o UserKnownHostsFile=/dev/null
    -o LogLevel=ERROR
    -o "ConnectTimeout=${SSH_CONNECT_TIMEOUT}"
    -o "ServerAliveInterval=${SSH_SERVER_ALIVE_INTERVAL}"
    -o ServerAliveCountMax=3
    -o TCPKeepAlive=yes
  )
  # --foreground keeps ssh attached to the terminal (prompts, Ctrl-C) while
  # still enforcing the limit.
  local -a runner=(timeout --foreground "$REMOTE_CMD_TIMEOUT")

  # "|| exit_code=$?" records the status without tripping "set -e".
  if [[ -n "$password" ]]; then
    # The password is handed to sshpass through the environment, not argv.
    SSHPASS="$password" "${runner[@]}" sshpass -e \
      ssh "${ssh_opts[@]}" -p "$port" "${user}@${host}" "$cmd" || exit_code=$?
  else
    "${runner[@]}" \
      ssh "${ssh_opts[@]}" -p "$port" "${user}@${host}" "$cmd" || exit_code=$?
  fi

  case "$exit_code" in
    0) ;;
    124) error "远程命令执行超时(超过${REMOTE_CMD_TIMEOUT}秒)" >&2 ;;
    255) error "SSH连接失败:网络问题或远程主机不可达" >&2 ;;
    # Anything else is the remote command's own exit status
    *) error "远程命令执行失败,退出代码: $exit_code" >&2 ;;
  esac
  return "$exit_code"
}
# Copy the contents of a local directory to a remote directory with scp and
# verify the node_exporter binary by MD5, retrying on failure.
# Globals:   SSH_CONNECT_TIMEOUT, SSH_SERVER_ALIVE_INTERVAL,
#            MAX_TRANSFER_RETRIES (read)
# Arguments: $1 - host, $2 - SSH port, $3 - user,
#            $4 - password (empty selects key authentication),
#            $5 - local source directory, $6 - existing remote directory,
#            $7 - expected MD5 of <source>/node_exporter
# Returns:   0 once a transfer was verified; 1 after MAX_TRANSFER_RETRIES
#            failed attempts
remote_copy_dir_with_verify() {
  local host="$1"
  local port="$2"
  local user="$3"
  local password="$4"
  local src_dir="$5"
  local dest_dir="$6"
  local expected_md5="$7"
  local attempt=0
  local copied remote_md5
  local -a scp_opts=(
    -o StrictHostKeyChecking=no
    -o UserKnownHostsFile=/dev/null
    -o LogLevel=ERROR
    -o "ConnectTimeout=${SSH_CONNECT_TIMEOUT}"
    -o "ServerAliveInterval=${SSH_SERVER_ALIVE_INTERVAL}"
    -o ServerAliveCountMax=3
    -o TCPKeepAlive=yes
  )

  while (( attempt < MAX_TRANSFER_RETRIES )); do
    if (( attempt > 0 )); then
      warn "目录传输重试 $attempt/$MAX_TRANSFER_RETRIES..."
      # Start from an empty destination; it has to be recreated because scp
      # copies INTO the directory.
      remote_exec "$host" "$port" "$user" "$password" \
        "rm -rf '$dest_dir' && mkdir -p '$dest_dir'" &> /dev/null || true
      sleep 2
    fi
    info "正在传输目录: $src_dir -> $dest_dir"
    copied=true
    if [[ -n "$password" ]]; then
      SSHPASS="$password" sshpass -e scp -r "${scp_opts[@]}" -P "$port" \
        "$src_dir"/* "${user}@${host}:${dest_dir}/" || copied=false
    else
      scp -r "${scp_opts[@]}" -P "$port" \
        "$src_dir"/* "${user}@${host}:${dest_dir}/" || copied=false
    fi
    if [[ "$copied" == true ]]; then
      info "验证远程文件完整性..."
      remote_md5=$(remote_exec "$host" "$port" "$user" "$password" \
        "md5sum '$dest_dir/node_exporter' 2>/dev/null | cut -d' ' -f1") || remote_md5=""
      # An empty digest never counts as a match.
      if [[ -n "$remote_md5" && "$remote_md5" == "$expected_md5" ]]; then
        info "✅ 目录传输成功,MD5校验通过: $remote_md5"
        return 0
      fi
      error "MD5校验不匹配!本地: $expected_md5,远程: ${remote_md5:-ERROR_MD5_FAILED}"
    else
      error "目录传输失败"
    fi
    # Plain assignment: "((attempt++))" returns 1 when attempt is 0, which
    # would end the script under "set -e".
    attempt=$((attempt + 1))
  done

  error "目录传输失败,已重试 $MAX_TRANSFER_RETRIES"
  return 1
}
# Run a series of connectivity checks against a host and print the findings:
# ping, TCP port probe (when nc exists), SSH banner, authentication and free
# space in the remote /tmp. Every step is best effort; a failing step is
# reported and the next one still runs.
# Arguments: $1 - host, $2 - SSH port, $3 - user,
#            $4 - password (empty selects key authentication)
# Returns:   0
diagnose_ssh_connection() {
  local host="$1"
  local port="$2"
  local user="$3"
  local password="$4"
  local auth_ok=true
  local disk_space
  local -a auth_opts=(
    -o StrictHostKeyChecking=no
    -o LogLevel=ERROR
    -o ConnectTimeout=10
    -p "$port"
  )

  info "开始SSH连接诊断..."

  info "1. 测试网络连通性..."
  if ping -c 5 -W 2 "$host" &> /dev/null; then
    info " ✓ 网络连通性正常"
  else
    error " ✗ 网络连通性失败,无法ping通主机"
  fi

  info "2. 测试SSH端口 $port 是否开放..."
  if command -v nc &> /dev/null; then
    if nc -z -w 10 "$host" "$port"; then
      info " ✓ SSH端口 $port 开放"
    else
      error " ✗ SSH端口 $port 关闭或被防火墙阻止"
    fi
  else
    warn " 未找到nc命令,跳过端口测试"
  fi

  info "3. 测试SSH服务响应..."
  # Host and port are passed as positional parameters rather than pasted into
  # the command string, so they cannot be interpreted as shell syntax.
  if timeout 10 bash -c 'echo "" | telnet "$1" "$2" 2>&1 | grep -i "ssh"' _ "$host" "$port" &> /dev/null; then
    info " ✓ SSH服务正在运行"
  else
    error " ✗ SSH服务未响应"
  fi

  info "4. 测试身份验证..."
  # "|| auth_ok=false" keeps a failed login from ending the script under
  # "set -e" before the result can be reported.
  if [[ -n "$password" ]]; then
    info " 使用密码认证方式"
    SSHPASS="$password" sshpass -e ssh "${auth_opts[@]}" "${user}@${host}" "echo '认证成功'" 2>&1 || auth_ok=false
  else
    info " 使用密钥认证方式"
    ssh "${auth_opts[@]}" "${user}@${host}" "echo '认证成功'" 2>&1 || auth_ok=false
  fi
  if [[ "$auth_ok" == true ]]; then
    info " ✓ 身份验证成功"
  else
    error " ✗ 身份验证失败"
    info ""
    info "常见解决方案:"
    info " 1. 检查用户名和密码是否正确"
    info " 2. 确认远程主机允许密码认证(PasswordAuthentication yes)"
    info " 3. 确认远程主机允许root登录(PermitRootLogin yes)"
    info " 4. 检查远程主机的防火墙和SELinux设置"
    info " 5. 查看远程主机的SSH日志:tail -f /var/log/secure"
  fi

  info "5. 测试远程磁盘空间..."
  # df pads its columns with runs of spaces; squeeze them so that field 4
  # (available 1K blocks) is really the fourth field for cut.
  disk_space=$(remote_exec "$host" "$port" "$user" "$password" \
    "df -P /tmp | tail -1 | tr -s ' ' | cut -d' ' -f4" 2> /dev/null) || disk_space=""
  if [[ "$disk_space" =~ ^[0-9]+$ ]] && (( disk_space > 10240 )); then
    info " ✓ 远程/tmp目录可用空间: $((disk_space / 1024)) MB"
  else
    error " ✗ 远程/tmp目录空间不足或无法访问"
  fi
  info "诊断完成"
}
# Install node_exporter on one remote host and register it with Prometheus.
# Steps: test SSH, detect the OS release, upload the local directory, generate
# and run an install script on the host, then add a scrape job for it.
# Globals:   REMOTE_TMP_DIR, LOCAL_EXTRACT_DIR, LOCAL_BINARY_MD5,
#            NODE_EXPORTER_PORT, PROMETHEUS_CONFIG_FILE (read);
#            SUCCESS_COUNT, FAILURE_COUNT, FAILURE_HOSTS (updated)
# Arguments: $1 - host as "IP" or "IP:ssh_port"
#            $2 - SSH user
#            $3 - SSH password (empty selects key authentication)
# Returns:   1 when the host is unreachable, cannot be prepared or the upload
#            fails; 0 otherwise (a failed remote install is recorded in
#            FAILURE_COUNT/FAILURE_HOSTS but still returns 0)
install_on_host() {
  local host_port="$1"
  local user="$2"
  local password="$3"
  local host="${host_port%:*}"
  local port="22"
  local os_release
  local os_major=""
  local os_family="other"
  local system_type="unknown"
  local service_manager="unknown"
  local diagnose_answer=""
  local install_script remote_hostname

  if [[ "$host_port" == *:* ]]; then
    port="${host_port#*:}"
  fi
  info "============================================="
  info "开始处理主机: ${host}:${port}"

  info "测试SSH连接..."
  if ! remote_exec "$host" "$port" "$user" "$password" "echo 'SSH连接成功'" &> /dev/null; then
    error "无法连接到主机 ${host}:${port}"
    read -r -p "是否进行SSH连接诊断? (y/n): " diagnose_answer || diagnose_answer=""
    if [[ "$diagnose_answer" == [yY] ]]; then
      diagnose_ssh_connection "$host" "$port" "$user" "$password" || true
    fi
    # Counters use plain assignment: "((COUNT++))" returns 1 when COUNT is 0
    # and would terminate the script under "set -e".
    FAILURE_COUNT=$((FAILURE_COUNT + 1))
    FAILURE_HOSTS+=("$host_port")
    return 1
  fi
  info "✅ SSH连接成功"

  info "检测系统版本..."
  if ! os_release=$(remote_exec "$host" "$port" "$user" "$password" \
    "cat /etc/redhat-release 2>/dev/null || cat /etc/issue 2>/dev/null | head -1"); then
    error "无法获取系统版本信息"
    FAILURE_COUNT=$((FAILURE_COUNT + 1))
    FAILURE_HOSTS+=("$host_port")
    return 1
  fi
  info "检测到系统: $os_release"

  # Decide by the MAJOR version after the word "release". Matching a digit
  # anywhere in the string would classify e.g. "release 7.6" as version 6.
  if [[ "$os_release" =~ [Rr]elease[[:space:]]+([0-9]+) ]]; then
    os_major="${BASH_REMATCH[1]}"
  fi
  if grep -qiE "red.*hat|rhel" <<< "$os_release"; then
    os_family="rhel"
  elif grep -qi "centos" <<< "$os_release"; then
    os_family="centos"
  fi
  case "${os_family}:${os_major}" in
    rhel:5 | centos:5)
      system_type="rhel5"
      service_manager="sysvinit"
      ;;
    rhel:6 | centos:6)
      system_type="rhel6"
      service_manager="sysvinit"
      ;;
    rhel:7 | centos:7)
      system_type="rhel7"
      service_manager="systemd"
      ;;
    rhel:9)
      system_type="rhel9"
      service_manager="systemd"
      ;;
    *)
      warn "不支持的系统类型,尝试使用通用方式安装"
      # Unknown system: choose the service manager by probing for systemctl
      if remote_exec "$host" "$port" "$user" "$password" "command -v systemctl &> /dev/null"; then
        service_manager="systemd"
      else
        service_manager="sysvinit"
      fi
      ;;
  esac
  info "系统类型: $system_type, 服务管理器: $service_manager"

  info "清理并创建远程临时目录..."
  if ! remote_exec "$host" "$port" "$user" "$password" \
    "rm -rf '$REMOTE_TMP_DIR' && mkdir -p '$REMOTE_TMP_DIR'"; then
    error "创建远程临时目录失败"
    FAILURE_COUNT=$((FAILURE_COUNT + 1))
    FAILURE_HOSTS+=("$host_port")
    return 1
  fi

  info "传输安装文件到远程主机..."
  if ! remote_copy_dir_with_verify "$host" "$port" "$user" "$password" \
    "$LOCAL_EXTRACT_DIR" "$REMOTE_TMP_DIR" "$LOCAL_BINARY_MD5"; then
    error "传输安装文件失败"
    FAILURE_COUNT=$((FAILURE_COUNT + 1))
    FAILURE_HOSTS+=("$host_port")
    return 1
  fi

  info "开始远程安装..."
  # The here-document is UNQUOTED: $system_type, $service_manager,
  # $REMOTE_TMP_DIR, $LOCAL_BINARY_MD5 and ${NODE_EXPORTER_PORT} are expanded
  # here, on the control host. Everything the remote shell must evaluate is
  # written as \$. Backslash-newline pairs are removed as well, so the
  # ExecStart continuation lines end up as one line in the unit file.
  install_script=$(cat << EOF
#!/bin/bash
set -euo pipefail
cd "$REMOTE_TMP_DIR"
# Verify the uploaded binary
echo "验证远程文件完整性..."
if ! md5sum "node_exporter" | cut -d' ' -f1 | grep -q "$LOCAL_BINARY_MD5"; then
  echo "ERROR: 远程文件MD5校验失败!"
  echo "本地MD5: $LOCAL_BINARY_MD5"
  echo "远程MD5: \$(md5sum "node_exporter" | cut -d' ' -f1)"
  exit 1
fi
echo "文件完整性验证通过"
# Create the node_exporter account if it is missing
if ! id node_exporter &> /dev/null; then
  useradd -M -s /sbin/nologin node_exporter 2>/dev/null || adduser -M -s /sbin/nologin node_exporter
fi
# Install the binary
cp node_exporter /usr/local/bin/
chmod +x /usr/local/bin/node_exporter
chown node_exporter:node_exporter /usr/local/bin/node_exporter
# Create the data directory
mkdir -p /var/lib/node_exporter
chown node_exporter:node_exporter /var/lib/node_exporter
# RHEL9 only: SELinux adjustments (best effort)
if [ "$system_type" = "rhel9" ]; then
  echo "配置RHEL9 SELinux规则..."
  semanage port -a -t http_port_t -p tcp ${NODE_EXPORTER_PORT} 2>/dev/null || true
  setsebool -P domain_can_mmap_files 1 2>/dev/null || true
fi
# Install the service definition
if [ "$service_manager" = "systemd" ]; then
  cat > /etc/systemd/system/node_exporter.service << 'SERVICE_EOF'
[Unit]
Description=Prometheus Node Exporter
After=network.target
[Service]
User=node_exporter
Group=node_exporter
Type=simple
ExecStart=/usr/local/bin/node_exporter \
--collector.systemd \
--collector.processes \
--collector.filesystem.ignored-mount-points="^/(sys|proc|dev|host|etc|run|var/lib/docker)(\$|/)" \
--collector.cpu.info \
--collector.meminfo \
--collector.loadavg \
--collector.diskstats \
--collector.netdev \
--web.listen-address=:${NODE_EXPORTER_PORT}
Restart=always
RestartSec=5
Delegate=yes
ProtectSystem=strict
ProtectHome=yes
PrivateTmp=yes
ProtectKernelTunables=no
ProtectControlGroups=no
[Install]
WantedBy=multi-user.target
SERVICE_EOF
  systemctl daemon-reload
  systemctl enable node_exporter
  systemctl start node_exporter
else
  cat > /etc/init.d/node_exporter << 'SERVICE_EOF'
#!/bin/bash
# chkconfig: 2345 90 10
# description: Prometheus Node Exporter
NAME="node_exporter"
DAEMON="/usr/local/bin/\${NAME}"
PIDFILE="/var/run/\${NAME}.pid"
USER="node_exporter"
OPTIONS="--collector.systemd --collector.processes --collector.filesystem.ignored-mount-points=\"^/(sys|proc|dev|host|etc)(\\\$|/)\" --web.listen-address=:${NODE_EXPORTER_PORT}"
start() {
  if [ -f "\$PIDFILE" ]; then
    echo "\$NAME is already running"
    exit 1
  fi
  echo "Starting \$NAME..."
  # The daemon is backgrounded inside the su shell, so that shell has to
  # report the PID; the PID file itself is written by root.
  su -s /bin/sh \$USER -c "\$DAEMON \$OPTIONS >/dev/null 2>&1 & echo \\\$!" > "\$PIDFILE"
  echo "\$NAME started"
}
stop() {
  if [ ! -f "\$PIDFILE" ]; then
    echo "\$NAME is not running"
    exit 1
  fi
  echo "Stopping \$NAME..."
  kill \$(cat "\$PIDFILE")
  rm -f "\$PIDFILE"
  echo "\$NAME stopped"
}
status() {
  if [ -f "\$PIDFILE" ]; then
    echo "\$NAME is running (PID: \$(cat "\$PIDFILE"))"
  else
    echo "\$NAME is not running"
  fi
}
restart() {
  stop
  sleep 2
  start
}
case "\$1" in
  (start) start ;;
  (stop) stop ;;
  (status) status ;;
  (restart) restart ;;
  (*) echo "Usage: \$0 {start|stop|status|restart}"; exit 1 ;;
esac
SERVICE_EOF
  chmod +x /etc/init.d/node_exporter
  chkconfig --add node_exporter 2>/dev/null || true
  chkconfig node_exporter on 2>/dev/null || true
  service node_exporter start
fi
# Open the exporter port in the firewall
if [ "$system_type" = "rhel5" ] || [ "$system_type" = "rhel6" ]; then
  if command -v iptables &> /dev/null; then
    iptables -I INPUT -p tcp --dport ${NODE_EXPORTER_PORT} -j ACCEPT
    if [ -f /etc/sysconfig/iptables ]; then
      service iptables save 2>/dev/null || true
    fi
  fi
elif [ "$system_type" = "rhel7" ] || [ "$system_type" = "rhel9" ]; then
  if command -v firewall-cmd &> /dev/null; then
    firewall-cmd --permanent --add-port=${NODE_EXPORTER_PORT}/tcp 2>/dev/null || true
    firewall-cmd --reload 2>/dev/null || true
  fi
fi
# Verify the installation
echo "等待服务启动..."
sleep 5
for i in 1 2 3; do
  echo "验证尝试 \$i/3..."
  if command -v curl &> /dev/null; then
    if curl -sf --connect-timeout 5 http://localhost:${NODE_EXPORTER_PORT}/metrics &> /dev/null; then
      echo "SUCCESS: Node Exporter安装成功并正在运行"
      exit 0
    fi
  elif command -v wget &> /dev/null; then
    if wget -q -T 5 -O /dev/null http://localhost:${NODE_EXPORTER_PORT}/metrics; then
      echo "SUCCESS: Node Exporter安装成功并正在运行"
      exit 0
    fi
  else
    # Match the process name exactly; a substring search would also match
    # this script, whose path contains "node_exporter".
    if pgrep -x node_exporter &> /dev/null; then
      echo "SUCCESS: Node Exporter进程已启动"
      exit 0
    fi
  fi
  sleep 3
done
echo "ERROR: Node Exporter安装失败"
systemctl status node_exporter 2>/dev/null || service node_exporter status 2>/dev/null || true
exit 1
EOF
  )

  # Stream the script over SSH stdin; no local temporary file is needed.
  if ! remote_exec "$host" "$port" "$user" "$password" \
    "cat > '$REMOTE_TMP_DIR/install_remote.sh'" <<< "$install_script"; then
    error "传输安装脚本失败"
    FAILURE_COUNT=$((FAILURE_COUNT + 1))
    FAILURE_HOSTS+=("$host_port")
    return 1
  fi

  info "执行远程安装脚本..."
  if remote_exec "$host" "$port" "$user" "$password" \
    "chmod +x '$REMOTE_TMP_DIR/install_remote.sh' && '$REMOTE_TMP_DIR/install_remote.sh'"; then
    info "✅ 主机 ${host}:${port} 安装成功"
    SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
    info "获取远程主机名..."
    remote_hostname=$(remote_exec "$host" "$port" "$user" "$password" \
      "hostname -s 2>/dev/null || hostname" 2> /dev/null) || remote_hostname=""
    # Use the first output line only
    remote_hostname="${remote_hostname%%$'\n'*}"
    if [[ -z "$remote_hostname" ]]; then
      warn "无法获取远程主机名,使用IP作为job_name"
      remote_hostname="$host"
    fi
    info "远程主机名: $remote_hostname"
    # The exporter is installed at this point; a config problem is reported
    # but must not end the whole run.
    if ! update_prometheus_config "$remote_hostname" "$host"; then
      warn "Prometheus配置更新失败,请手动检查: $PROMETHEUS_CONFIG_FILE"
    fi
  else
    error "主机 ${host}:${port} 安装失败"
    FAILURE_COUNT=$((FAILURE_COUNT + 1))
    FAILURE_HOSTS+=("$host_port")
  fi

  info "清理远程临时文件..."
  remote_exec "$host" "$port" "$user" "$password" "rm -rf '$REMOTE_TMP_DIR'" &> /dev/null || true
  return 0
}
# Uninstall node_exporter from one remote host and drop it from the
# Prometheus configuration.
# Globals:   NODE_EXPORTER_PORT (read);
#            SUCCESS_COUNT, FAILURE_COUNT, FAILURE_HOSTS (updated)
# Arguments: $1 - host as "IP" or "IP:ssh_port"
#            $2 - SSH user
#            $3 - SSH password (empty selects key authentication)
# Returns:   1 when the host is unreachable or the script cannot be uploaded;
#            0 otherwise (a failed remote uninstall is recorded in
#            FAILURE_COUNT/FAILURE_HOSTS but still returns 0)
uninstall_on_host() {
  local host_port="$1"
  local user="$2"
  local password="$3"
  local host="${host_port%:*}"
  local port="22"
  local remote_hostname uninstall_script

  if [[ "$host_port" == *:* ]]; then
    port="${host_port#*:}"
  fi
  info "============================================="
  info "开始卸载主机: ${host}:${port}"

  info "测试SSH连接..."
  if ! remote_exec "$host" "$port" "$user" "$password" "echo 'SSH连接成功'" &> /dev/null; then
    error "无法连接到主机 ${host}:${port}"
    # Plain assignment: "((COUNT++))" returns 1 when COUNT is 0 and would
    # terminate the script under "set -e".
    FAILURE_COUNT=$((FAILURE_COUNT + 1))
    FAILURE_HOSTS+=("$host_port")
    return 1
  fi
  info "✅ SSH连接成功"

  # The hostname is the job name used when the host was added
  info "获取远程主机名..."
  remote_hostname=$(remote_exec "$host" "$port" "$user" "$password" \
    "hostname -s 2>/dev/null || hostname" 2> /dev/null) || remote_hostname=""
  remote_hostname="${remote_hostname%%$'\n'*}"
  if [[ -z "$remote_hostname" ]]; then
    warn "无法获取远程主机名,将使用IP:PORT从Prometheus配置中移除"
    remote_hostname="${host}:${NODE_EXPORTER_PORT}"
  fi
  info "远程主机名: $remote_hostname"

  info "开始远程卸载..."
  # Unquoted here-document: ${NODE_EXPORTER_PORT} is expanded on the control
  # host; nothing else in the script needs expansion.
  uninstall_script=$(cat << EOF
#!/bin/bash
set -euo pipefail
echo "停止node_exporter服务..."
if command -v systemctl &> /dev/null; then
  systemctl stop node_exporter 2>/dev/null || true
  systemctl disable node_exporter 2>/dev/null || true
  rm -f /etc/systemd/system/node_exporter.service
  systemctl daemon-reload
else
  service node_exporter stop 2>/dev/null || true
  chkconfig node_exporter off 2>/dev/null || true
  chkconfig --del node_exporter 2>/dev/null || true
  rm -f /etc/init.d/node_exporter
fi
echo "删除node_exporter二进制文件..."
rm -f /usr/local/bin/node_exporter
echo "删除node_exporter用户和数据目录..."
userdel node_exporter 2>/dev/null || true
rm -rf /var/lib/node_exporter
echo "删除防火墙规则..."
if command -v firewall-cmd &> /dev/null; then
  firewall-cmd --permanent --remove-port=${NODE_EXPORTER_PORT}/tcp 2>/dev/null || true
  firewall-cmd --reload 2>/dev/null || true
elif command -v iptables &> /dev/null; then
  iptables -D INPUT -p tcp --dport ${NODE_EXPORTER_PORT} -j ACCEPT 2>/dev/null || true
  if [ -f /etc/sysconfig/iptables ]; then
    service iptables save 2>/dev/null || true
  fi
fi
echo "SUCCESS: Node Exporter卸载成功"
exit 0
EOF
  )

  # Stream the script over SSH stdin; no local temporary file is needed.
  if ! remote_exec "$host" "$port" "$user" "$password" \
    "cat > '/tmp/uninstall_remote.sh'" <<< "$uninstall_script"; then
    error "传输卸载脚本失败"
    FAILURE_COUNT=$((FAILURE_COUNT + 1))
    FAILURE_HOSTS+=("$host_port")
    return 1
  fi

  info "执行远程卸载脚本..."
  if remote_exec "$host" "$port" "$user" "$password" \
    "chmod +x /tmp/uninstall_remote.sh && /tmp/uninstall_remote.sh"; then
    info "✅ 主机 ${host}:${port} 卸载成功"
    SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
    # Try the job name first, then the ip:port target. Both attempts sit in
    # conditions so a miss cannot end the script under "set -e".
    if ! remove_from_prometheus_config "$remote_hostname"; then
      remove_from_prometheus_config "${host}:${NODE_EXPORTER_PORT}" || true
    fi
  else
    error "主机 ${host}:${port} 卸载失败"
    FAILURE_COUNT=$((FAILURE_COUNT + 1))
    FAILURE_HOSTS+=("$host_port")
  fi

  # Remove the uploaded script (best effort)
  remote_exec "$host" "$port" "$user" "$password" "rm -f /tmp/uninstall_remote.sh" &> /dev/null || true
  return 0
}
# Batch install: read the host list, ask for confirmation, then install on
# every host with the default credentials.
# Globals:   REMOTE_HOSTS (via read_hosts_file), REMOTE_USER,
#            REMOTE_PASSWORD (read)
# Returns:   0; per-host results are tracked by install_on_host
batch_install() {
  local entry confirm
  read_hosts_file
  info "进入批量安装模式"
  info "将安装到以下 ${#REMOTE_HOSTS[@]} 个主机:"
  for entry in "${REMOTE_HOSTS[@]}"; do
    blue " - $entry"
  done
  read -r -p "确认开始批量安装? (y/n): " confirm
  if [[ "$confirm" != [yY] ]]; then
    info "取消批量安装"
    return 0
  fi
  for entry in "${REMOTE_HOSTS[@]}"; do
    # A failed host is already counted by install_on_host; under "set -e" its
    # non-zero return must not stop the remaining hosts.
    install_on_host "$entry" "$REMOTE_USER" "$REMOTE_PASSWORD" || true
  done
}
# Batch uninstall: read the host list, ask for confirmation, then uninstall
# from every host with the default credentials.
# Globals:   REMOTE_HOSTS (via read_hosts_file), REMOTE_USER,
#            REMOTE_PASSWORD (read)
# Returns:   0; per-host results are tracked by uninstall_on_host
batch_uninstall() {
  local entry confirm
  read_hosts_file
  info "进入批量卸载模式"
  info "将从以下 ${#REMOTE_HOSTS[@]} 个主机卸载:"
  for entry in "${REMOTE_HOSTS[@]}"; do
    blue " - $entry"
  done
  read -r -p "确认开始批量卸载? (y/n): " confirm
  if [[ "$confirm" != [yY] ]]; then
    info "取消批量卸载"
    return 0
  fi
  for entry in "${REMOTE_HOSTS[@]}"; do
    # A failed host is already counted by uninstall_on_host; under "set -e"
    # its non-zero return must not stop the remaining hosts.
    uninstall_on_host "$entry" "$REMOTE_USER" "$REMOTE_PASSWORD" || true
  done
}
# Interactive single-host install: prompt for host, SSH port, user and
# password, confirm, then call install_on_host.
# Returns:   0; the install result is tracked by install_on_host
single_install() {
  local target_host target_port target_user target_password confirm
  info "进入指定单主机安装模式"
  read -r -p "请输入目标主机IP地址: " target_host
  read -r -p "请输入SSH端口号(默认22): " target_port
  target_port=${target_port:-22}
  read -r -p "请输入登录用户名(默认root): " target_user
  target_user=${target_user:-root}
  read -r -s -p "请输入登录密码(留空使用密钥认证): " target_password
  echo ""
  info "您输入的信息:"
  blue " 主机: ${target_host}:${target_port}"
  blue " 用户: ${target_user}"
  if [[ -z "$target_password" ]]; then
    blue " 认证方式: SSH密钥认证"
  else
    blue " 认证方式: 密码认证"
  fi
  read -r -p "确认信息正确并开始安装? (y/n): " confirm
  if [[ "$confirm" != [yY] ]]; then
    info "取消安装"
    return 0
  fi
  # The failure is already recorded by install_on_host; keep going so the
  # caller can still print the summary under "set -e".
  install_on_host "${target_host}:${target_port}" "$target_user" "$target_password" || true
}
# Interactive single-host uninstall: prompt for host, SSH port, user and
# password, confirm, then call uninstall_on_host.
# Returns:   0; the uninstall result is tracked by uninstall_on_host
single_uninstall() {
  local target_host target_port target_user target_password confirm
  info "进入指定单主机卸载模式"
  read -r -p "请输入目标主机IP地址: " target_host
  read -r -p "请输入SSH端口号(默认22): " target_port
  target_port=${target_port:-22}
  read -r -p "请输入登录用户名(默认root): " target_user
  target_user=${target_user:-root}
  read -r -s -p "请输入登录密码(留空使用密钥认证): " target_password
  echo ""
  info "您输入的信息:"
  blue " 主机: ${target_host}:${target_port}"
  blue " 用户: ${target_user}"
  if [[ -z "$target_password" ]]; then
    blue " 认证方式: SSH密钥认证"
  else
    blue " 认证方式: 密码认证"
  fi
  read -r -p "确认信息正确并开始卸载? (y/n): " confirm
  if [[ "$confirm" != [yY] ]]; then
    info "取消卸载"
    return 0
  fi
  # The failure is already recorded by uninstall_on_host; keep going so the
  # caller can still print the summary under "set -e".
  uninstall_on_host "${target_host}:${target_port}" "$target_user" "$target_password" || true
}
# Interactively add a scrape job to the Prometheus configuration.
# Prompts for a job name and a target in "IP:port" form. When the target has
# no port, NODE_EXPORTER_PORT is used.
# Globals:   NODE_EXPORTER_PORT (read; overridden only for the duration of
#            the update_prometheus_config call)
# Returns:   0 when the job was added or already present (Prometheus is then
#            restarted if needed); 1 when the configuration update failed
manual_add_prometheus() {
  local job_name target target_ip target_port
  info "进入手动添加Prometheus监控节点模式"
  read -r -p "请输入job_name: " job_name
  read -r -p "请输入target (格式: IP:端口): " target
  target_ip="${target%:*}"
  target_port="$NODE_EXPORTER_PORT"
  if [[ "$target" == *:* && -n "${target##*:}" ]]; then
    target_port="${target##*:}"
  fi
  # update_prometheus_config always appends NODE_EXPORTER_PORT to the IP, so
  # the port the user typed is supplied as a temporary override for this call.
  if ! NODE_EXPORTER_PORT="$target_port" update_prometheus_config "$job_name" "$target_ip"; then
    error "添加Prometheus监控节点失败"
    return 1
  fi
  restart_prometheus
  info "✅ 手动添加完成"
}
# Print the run summary: host totals, the list of failed hosts (if any) and a
# note when the Prometheus configuration was changed.
# Globals:   SUCCESS_COUNT, FAILURE_COUNT, FAILURE_HOSTS,
#            NEED_RESTART_PROMETHEUS (read)
show_results() {
  local total=$((SUCCESS_COUNT + FAILURE_COUNT))
  local failed_host

  info "============================================="
  info "操作结果汇总:"
  info "总主机数: ${total}"
  info "成功: ${SUCCESS_COUNT}"
  info "失败: ${FAILURE_COUNT}"

  if (( FAILURE_COUNT > 0 )); then
    error "失败的主机:"
    for failed_host in "${FAILURE_HOSTS[@]}"; do
      error " - $failed_host"
    done
  else
    info "✅ 所有主机操作成功!"
  fi

  if [[ "$NEED_RESTART_PROMETHEUS" == true ]]; then
    info ""
    info "Prometheus配置已更新"
  fi
}
# Entry point: validate the Prometheus setup, show the menu and dispatch the
# selected operation. Options 1-4 fall through to the summary, the optional
# Prometheus restart and local cleanup; the other options exit directly.
# Arguments: none are used
# Returns:   0 after options 1-4; exits 0 for options 5-9 and 1 for an
#            invalid selection
main() {
  local operation_mode diag_host diag_port diag_user diag_password
  info "Node Exporter远程管理脚本(纯echo追加版)"
  info "使用本地已解压目录: $LOCAL_EXTRACT_DIR"
  info "IP配置文件: $HOSTS_FILE"
  info "Prometheus配置添加方式: 纯echo逐行追加"
  info "============================================="
  verify_prometheus_config
  echo ""
  blue "请选择操作:"
  echo " 1. 批量安装(从IP配置文件读取主机列表)"
  echo " 2. 指定安装(手动输入单台主机信息)"
  echo " 3. 批量卸载(从IP配置文件读取主机列表)"
  echo " 4. 指定卸载(手动输入单台主机信息)"
  echo " 5. 手动添加Prometheus监控节点"
  echo " 6. 生成示例IP配置文件"
  echo " 7. SSH连接诊断工具"
  echo " 8. 重新验证本地目录和Prometheus配置"
  echo " 9. 退出"
  echo ""
  read -r -p "请输入选项(1-9): " operation_mode
  # For options 1-4 a failing host is already counted by the per-host
  # functions; "|| true" makes sure the summary and the Prometheus restart
  # below still run under "set -e".
  case "$operation_mode" in
    1)
      verify_local_directory
      batch_install || true
      ;;
    2)
      verify_local_directory
      single_install || true
      ;;
    3)
      batch_uninstall || true
      ;;
    4)
      single_uninstall || true
      ;;
    5)
      manual_add_prometheus
      exit 0
      ;;
    6)
      generate_sample_hosts_file
      ;;
    7)
      info "进入SSH连接诊断工具"
      read -r -p "请输入目标主机IP地址: " diag_host
      read -r -p "请输入SSH端口号(默认22): " diag_port
      diag_port=${diag_port:-22}
      read -r -p "请输入登录用户名(默认root): " diag_user
      diag_user=${diag_user:-root}
      read -r -s -p "请输入登录密码(留空使用密钥认证): " diag_password
      echo ""
      diagnose_ssh_connection "$diag_host" "$diag_port" "$diag_user" "$diag_password"
      exit 0
      ;;
    8)
      info "重新验证本地目录和Prometheus配置..."
      verify_local_directory
      verify_prometheus_config
      info "✅ 所有验证完成,请重新运行脚本"
      exit 0
      ;;
    9)
      info "退出脚本"
      exit 0
      ;;
    *)
      error "无效的选项"
      exit 1
      ;;
  esac
  show_results
  restart_prometheus
  info "清理本地临时文件..."
  rm -f "/tmp/install_remote.sh" "/tmp/uninstall_remote.sh"
  info "✅ 脚本执行完成"
}
# Script entry: check local dependencies first, then run the interactive menu
check_dependencies
main "$@"