supervisor管理服务
# Python 进程管理工具 Supervisor 使用
## 配置文件路径
/etc/supervisor/supervisord.conf #主配置文件
/etc/supervisor/conf.d/listener.conf #监控告警服务路径
/data/script/listener.py #告警脚本路径
/usr/lib/systemd/system/supervisord.service #systemctl 管理配置文件路径;其中 supervisord / supervisorctl 的路径取决于 python3、pip3 的安装位置,请自行确认(可用 which supervisord 查看)
输入命令 supervisorctl 可以进入 supervisorctl 的交互式 shell,在其中直接输入子命令;也可以像下面这样在系统 shell 中逐条执行:
supervisorctl status # 查看程序状态
supervisorctl stop program_name # 关闭 指定的程序
supervisorctl start program_name # 启动 指定的程序
supervisorctl restart program_name # 重启 指定的程序
supervisorctl tail -f program_name # 查看 该程序的日志
supervisorctl reread # 新增或修改配置文件之后先执行:重新读取配置并列出变更,不会启动或重启任何程序
supervisorctl update # 重启配置文件修改过的程序(修改了配置,通过这个命令加载新的配置)
可以多个程序配置在一个配置文件里面
也可以单独配置配置文件,编辑新文件在 /etc/supervisor/conf.d/ 就行
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# 配置文件实例 test.conf
[program:test_xxxxxx]
; Working directory for the program.
directory=/data/test/
; Command line used to launch the program.
command=/data/test/bin/test -config /data/test/conf/xxxxxx.toml -http-addr 0.0.0.0:8001
; Whether the program starts together with supervisord (default: true).
autostart=false
; Restart policy when the child exits (default: unexpected). One of:
;   false      - never restart
;   unexpected - restart only when the exit code is not listed in exitcodes
;   true       - always restart
autorestart=true
; Seconds the child must stay running after start to count as started (default: 1).
startsecs=1
; User account the program runs as.
user=root
; Log file receiving the program's stdout.
stdout_logfile=/data/test/logs/xxxxxx.log
; Redirect stderr into stdout (default: false).
redirect_stderr=true
; Maximum size of the stdout log file (default: 50MB).
stdout_logfile_maxbytes=200MB
; Number of stdout log backups to keep.
stdout_logfile_backups=2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# 安装
如果系统没有 pip3,先按下面的步骤编译安装 Python 3.9.2(会一并安装 pip3);已有 pip3 的话可以直接从 pip3 install 一步开始。
# Build Python 3.9.2 from source under /opt, then install supervisor and
# create its directories.
# Download and unpack inside /opt: the build step below enters
# /opt/Python-3.9.2, so the archive must be extracted there and not in
# whatever directory the shell happened to start in.
cd /opt
sudo wget https://www.python.org/ftp/python/3.9.2/Python-3.9.2.tgz
# Headers and compiler needed to build the optional stdlib modules.
sudo yum -y install bzip2-devel sqlite-devel openssl-devel readline-devel xz-devel tk-devel gdbm-devel libffi-devel gcc
sudo tar zxvf Python-3.9.2.tgz
cd /opt/Python-3.9.2
sudo ./configure
sudo make
sudo make install
# NOTE(review): if sudo cannot find pip3 after the build, the install prefix is
# probably missing from sudo's PATH - confirm with `command -v pip3`.
sudo pip3 install supervisor
# requests is imported by the alert script /data/script/listener.py.
sudo pip3 install requests
# Runtime directory (socket, pid, logs) and per-program config directory.
sudo mkdir -p /data/supervisor/
sudo mkdir -p /etc/supervisor/conf.d/
sudo mkdir -p /data/supervisor/logs/
1
2
3
4
5
6
7
8
9
10
11
12
13
14
2
3
4
5
6
7
8
9
10
11
12
13
14
# 配置主配置文件
# Write the main supervisord configuration.
# `sudo tee` is used instead of `sudo cat > file`: with the latter the
# redirection is opened by the calling (unprivileged) shell, not by sudo, so
# writing under /etc fails for a non-root user.
sudo tee /etc/supervisor/supervisord.conf >/dev/null <<"EOF"
[unix_http_server]
; Control socket, kept under /data/supervisor/.
file=/data/supervisor/supervisor.sock
; Socket permission bits: owner only. The previous value 077 denied the owner
; and granted full access to group and others. Non-root users must therefore
; run supervisorctl through sudo.
chmod=0700

[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface

[supervisord]
; Main log file (default: $CWD/supervisord.log).
logfile=/data/supervisor/supervisord.log
; Rotate the log once it exceeds this size (default: 50MB; 0 = unlimited).
logfile_maxbytes=50MB
; Number of rotated logs to keep (default: 10; 0 = keep none).
logfile_backups=10
; Log level (default: info); others include debug, warn, trace.
loglevel=info
; PID file.
pidfile=/data/supervisor/supervisord.pid
; Run in the foreground? Default false, i.e. start as a daemon.
nodaemon=false
; Minimum number of file descriptors that must be available (default: 1024).
minfds=1024
; Minimum number of process descriptors that must be available (default: 200).
minprocs=200

[supervisorctl]
; Connect to supervisord over the UNIX socket; the path must match file= in
; [unix_http_server].
serverurl=unix:///data/supervisor/supervisor.sock
; Alternative: connect to supervisord over HTTP.
;serverurl=http://127.0.0.1:9001

[include]
; Additional configuration files to load: every *.conf file in conf.d.
files = /etc/supervisor/conf.d/*.conf
EOF
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# 配置自定义服务配置文件
上文的test.conf
# 配置systemctl管理
# Install the systemd unit for supervisord, then enable and start it.
# `sudo tee` is used instead of `sudo cat > file`: with the latter the
# redirection is opened by the calling (unprivileged) shell, not by sudo.
sudo tee /usr/lib/systemd/system/supervisord.service >/dev/null <<"EOF"
# supervisord service for systemd (CentOS 7.0+)
# by ET-CS (https://github.com/ET-CS)
[Unit]
Description=Supervisor daemon

[Service]
User=root
Type=forking
# Adjust /usr/bin if `command -v supervisord` reports a different location.
ExecStart=/usr/bin/supervisord -c /etc/supervisor/supervisord.conf
# The config file is passed explicitly so supervisorctl reads the same file
# as ExecStart. The former $OPTIONS variable was never defined in this unit
# and expanded to nothing.
ExecStop=/usr/bin/supervisorctl -c /etc/supervisor/supervisord.conf shutdown
ExecReload=/usr/bin/supervisorctl -c /etc/supervisor/supervisord.conf reload
KillMode=process
Restart=on-failure
RestartSec=42s

[Install]
WantedBy=multi-user.target
EOF
# Make systemd pick up the new unit, start it at boot and right now, then show its state.
sudo systemctl daemon-reload
sudo systemctl enable supervisord.service
sudo systemctl start supervisord.service
sudo systemctl status supervisord.service
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# 告警脚本
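将 listener.py 拷贝到 /data/script/ 目录,再在 /etc/supervisor/conf.d/ 下创建如下的 listener.conf 来配置 listener 服务即可(创建后执行 supervisorctl update 使其生效)。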
listener.conf
[eventlistener:listener]
; Events this listener subscribes to.
events=PROCESS_STATE_EXITED,PROCESS_STATE_STOPPED,PROCESS_STATE_FATAL,PROCESS_LOG_STDERR,PROCESS_STATE_RUNNING
; The custom monitoring program.
command=python3 /data/script/listener.py
autostart=true
autorestart=true
startsecs=1
user=root
stderr_logfile=/data/supervisor/listener_err.log
stdout_logfile=/data/supervisor/listener.log
stdout_logfile_maxbytes=200MB
stdout_logfile_backups=2
1
2
3
4
5
6
7
8
9
10
11
12
2
3
4
5
6
7
8
9
10
11
12
#!/usr/bin/env python
# coding=utf-8
'''
Supervisord listener example.
'''
import requests
import json
import sys
import os
import socket
import time
def write_stdout(s):
    """Write ``s`` to stdout and flush immediately.

    The listener loop sends its READY / RESULT tokens through this function,
    so the text must not be left sitting in a buffer.

    Args:
        s: Text to write.
    """
    sys.stdout.write(s)
    sys.stdout.flush()
def write_stderr(s):
    """Write ``s`` to stderr and flush immediately.

    Args:
        s: Text to write.
    """
    sys.stderr.write(s)
    sys.stderr.flush()
def send_message(msg):
    """Post an alert text to the chat webhook, retrying once on failure.

    Network errors are reported on stderr and never raised, so a failed push
    cannot take the listener down. Nothing is written to stdout, which the
    listener uses for its READY / RESULT exchange with supervisord.

    Args:
        msg: Alert text, sent as the "text" field of the JSON payload.
    """
    request_header = {
        "content-type": "application/json; charset=UTF-8",
        "Authorization": "xxxxxxxxxxxxxxxxxx"
    }
    proxies = {'http': 'http://x.x.x.x:83', 'https': 'http://x.x.x.x:83'}
    push_url = 'https://xxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    push_data = json.dumps({
        "roomname": "xxxxxxxxxxxxxxx",
        "text": msg
    })
    # NOTE(review): verify=False turns off TLS certificate verification and the
    # call below silences the resulting warning - confirm this is intended.
    requests.packages.urllib3.disable_warnings()
    # Two attempts in total: the first try plus the single retry the original
    # code intended to make in its except branch.
    for attempt in (1, 2):
        try:
            # The timeout keeps an unreachable proxy/webhook from blocking the
            # event loop forever.
            requests.post(url=push_url, headers=request_header, data=push_data,
                          proxies=proxies, verify=False, timeout=10)
            return
        except requests.RequestException as exc:
            sys.stderr.write("send_message attempt %d failed: %s\n" % (attempt, exc))
            sys.stderr.flush()
def parseData(data):
    """Split an event payload into its header dict and optional body.

    The first line holds space-separated ``key:value`` tokens; everything
    after the first newline is the body.

    Args:
        data: Raw payload text.

    Returns:
        ``(pheaders, pdata)``. ``pheaders`` is a dict of the first-line
        tokens. ``pdata`` is the text after the first newline, or ``None``
        when the payload has no newline.
    """
    head, newline, body = data.partition('\n')
    # Split each token on the first ':' only, so values that themselves
    # contain a colon stay intact.
    pheaders = dict(token.split(':', 1) for token in head.split())
    # Keep the whole body, not just its first line.
    pdata = body if newline else None
    return pheaders, pdata
def _detect_local_ip():
    """Return the local address used for outbound traffic, or "unknown"."""
    try:
        # The socket is closed on exit from the with-block; the original
        # never closed it.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as probe:
            probe.connect(("8.8.8.8", 80))
            return probe.getsockname()[0]
    except OSError:
        # No usable route at startup must not stop the listener from running.
        return "unknown"


def main():
    """Run the listener loop: read events from stdin and push alerts.

    Each iteration writes READY, reads one header line and ``len`` characters
    of payload from stdin, sends an alert for the handled event types, and
    writes the RESULT acknowledgement. Returns when the process was not
    started by supervisord or when stdin reaches EOF.
    """
    # Only supervisord can run this listener, otherwise exit.
    if 'SUPERVISOR_SERVER_URL' not in os.environ:
        # Format first, then write. The original applied % to the return
        # value of print(), which raises TypeError on Python 3.
        sys.stderr.write("%s must be run as a supervisor listener.\n" % sys.argv[0])
        sys.stderr.flush()
        return

    # Host name and IP address included in every alert.
    hostname = socket.gethostname()
    ip = _detect_local_ip()

    state_events = ('PROCESS_STATE_EXITED', 'PROCESS_STATE_FATAL',
                    'PROCESS_STATE_RUNNING', 'PROCESS_STATE_STOPPED')

    while True:
        # echo 'READY' and wait for event for stdin.
        write_stdout('READY\n')
        line = sys.stdin.readline()  # read header line from stdin
        if not line:
            # EOF on stdin: nothing more to read, stop instead of failing on
            # the missing 'len' header.
            return
        headers = dict(token.split(':', 1) for token in line.split())
        data = sys.stdin.read(int(headers['len']))  # read the event payload
        eventname = headers['eventname']

        if eventname in state_events:
            pheaders, pdata = parseData(data)
            process_name = pheaders['processname']
            alertime = time.asctime()
            if eventname == 'PROCESS_STATE_EXITED' and not int(pheaders['expected']):
                # Unexpected exit: the 'expected' field of the payload is 0.
                msg = ("❌❌xx测试环境进程%s(PID: %s)异常退出,请检查进程状态." % (process_name, pheaders['pid']) + '\n'
                       "主机名:" + hostname + '\n' +
                       "时间:" + alertime + '\n' +
                       "ip:" + ip)
                send_message(msg)
            elif eventname == 'PROCESS_STATE_RUNNING':
                msg = ("✅✅ xx测试环境进程%s启动成功." % (process_name) + '\n'
                       "时间:" + alertime + '\n' +
                       "主机名:" + hostname + '\n' +
                       "ip:" + ip)
                send_message(msg)
            elif eventname == 'PROCESS_STATE_FATAL':
                msg = ("❌❌xx测试环境进程%s启动失败,请检查进程状态." % (process_name) + '\n'
                       "时间:" + alertime + '\n' +
                       "主机名:" + hostname + '\n' +
                       "ip:" + ip)
                send_message(msg)
            # PROCESS_STATE_STOPPED and expected exits are acknowledged
            # without sending an alert.
        elif eventname == 'PROCESS_LOG_STDERR':
            alertime = time.asctime()
            pheaders, pdata = parseData(data)
            process_name = pheaders['processname']
            msg = ("❌❌xx测试环境进程%s(PID: %s)错误输出,请检查进程状态." % (process_name, pheaders['pid']) + '\n'
                   "时间:" + alertime + '\n' +
                   "主机名:" + hostname + '\n' +
                   "ip:" + ip)
            send_message(msg)

        # echo RESULT
        write_stdout('RESULT 2\nOK')  # transition from READY to ACKNOWLEDGED
# Start the listener loop only when the file is run as a script.
if __name__ == '__main__':
    main()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
上次更新: 8/28/2024
- 01
- GPT分区使用 parted 扩展分区的操作流程 原创08-28
- 02
- VictoriaMetrics 集群版安装与配置 原创08-24
- 03
- Kubernetes (k8s) 相关名词详解 原创06-27