Monitor.php 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616
  1. <?php
  2. require_once WORKERMAN_ROOT_DIR . 'Core/SocketWorker.php';
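  3. /**
  4. *
  5. * 1. Provides a telnet interface for remotely controlling the server and viewing service status
  6. * 2. Monitors whether the master process has died
  7. * 3. Monitors whether worker processes exit too frequently
  8. * 4. Periodically cleans up log files
  9. * 5. Periodically checks worker processes for memory leaks
  10. *
  11. * @author walkor <walkor@workerman.net>
  12. */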
  13. class Monitor extends Man\Core\SocketWorker
  14. {
  15. /**
  16. * Number of seconds in one day
  17. * @var integer
  18. */
  19. const SECONDS_ONE_DAY = 86400;
  20. /**
  21. * How often the on-disk log files are cleaned up (presumably seconds; passed to Task::add)
  22. * @var integer
  23. */
  24. const CLEAR_LOGS_TIME_LONG = 86400;
  25. /**
  26. * How often to check that the master process still exists (presumably seconds)
  27. * @var integer
  28. */
  29. const CHECK_MASTER_PROCESS_TIME_LONG = 5;
  30. /**
  31. * How often to check the master process status statistics (presumably seconds)
  32. * @var integer
  33. */
  34. const CHECK_MASTER_STATUS_TIME_LONG = 60;
  35. /**
  36. * How often to check worker memory usage (presumably seconds)
  37. * @var integer
  38. */
  39. const CHECK_WORKER_MEM_TIME_LONG = 60;
  40. /**
  41. * Log directories older than this many days are deleted
  42. * @var integer
  43. */
  44. const CLEAR_BEFORE_DAYS = 14;
  45. /**
  46. * Minimum number of seconds between two alerts of the same type (compared against time() differences)
  47. * @var integer
  48. */
  49. const WARING_SEND_TIME_LONG = 300;
  50. /**
  51. * Alert type: a large number of worker processes exited
  52. * @var integer
  53. */
  54. const WARNING_TOO_MANY_WORKERS_EXIT = 1;
  55. /**
  56. * Alert type: the master process died
  57. * @var integer
  58. */
  59. const WARNING_MASTER_DEAD = 8;
  60. /**
  61. * Default memory limit for a worker process, in KB (compared with VmRSS from /proc/<pid>/status)
  62. * @var integer
  63. */
  64. const DEFAULT_MEM_LIMIT = 83886;
  65. /**
  66. * Worker exit statistics seen by the previous checkMasterStatus() run, null before the first run
  67. * [worker_name=>[0=>xx, 9=>xxx], worker_name=>[0=>xx]]
  68. * @var array
  69. */
  70. protected $lastMasterStatus = null;
  71. /**
  72. * Admin authentication state keyed by connection fd: failed-attempt count, or time() of the successful login
  73. * @var array
  74. */
  75. protected $adminAuth = array();
  76. /**
  77. * Length of the longest worker name (column width used in the status output)
  78. * @var integer
  79. */
  80. protected $maxWorkerNameLength = 10;
  81. /**
  82. * Length of the longest listen address (column width used in the status output)
  83. * @var integer
  84. */
  85. protected $maxAddressLength = 20;
  86. /**
  87. * Time each alert type was last sent, keyed by the WARNING_* constants
  88. * @var array
  89. */
  90. protected static $lastWarningTimeMap = array(
  91. self::WARNING_TOO_MANY_WORKERS_EXIT => 0,
  92. self::WARNING_MASTER_DEAD => 0,
  93. );
  /**
   * Entry point of the monitor process.
   *
   * Installs the signal handlers, registers the periodic maintenance tasks,
   * starts accepting admin connections on the main socket and then enters
   * the event loop.
   *
   * Periodic tasks (first argument of Task::add is the interval constant):
   *  - clearLogs(WORKERMAN_LOG_DIR)  every CLEAR_LOGS_TIME_LONG
   *  - checkMasterProcess()          every CHECK_MASTER_PROCESS_TIME_LONG
   *  - checkMasterStatus()           every CHECK_MASTER_STATUS_TIME_LONG
   *  - checkMemUsage()               every CHECK_WORKER_MEM_TIME_LONG
   *
   * @see SocketWorker::start()
   * @return void
   */
  public function start()
  {
      // Install signal handlers
      $this->installSignal();

      // Register the periodic tasks
      \Man\Core\Lib\Task::init($this->event);
      \Man\Core\Lib\Task::add(self::CLEAR_LOGS_TIME_LONG, array($this, 'clearLogs'), array(WORKERMAN_LOG_DIR));
      \Man\Core\Lib\Task::add(self::CHECK_MASTER_PROCESS_TIME_LONG, array($this, 'checkMasterProcess'));
      \Man\Core\Lib\Task::add(self::CHECK_MASTER_STATUS_TIME_LONG, array($this, 'checkMasterStatus'));
      // The memory check has its own interval constant; it used to be scheduled
      // with the master-status interval, which left CHECK_WORKER_MEM_TIME_LONG unused.
      \Man\Core\Lib\Task::add(self::CHECK_WORKER_MEM_TIME_LONG, array($this, 'checkMemUsage'));

      // Accept incoming admin connections
      $this->event->add($this->mainSocket, \Man\Core\Events\BaseEvent::EV_READ, array($this, 'onAccept'));

      // Main loop
      $this->event->loop();
  }
  /**
   * Called when a new connection is ready to be accepted on the main socket.
   *
   * Connections from 127.0.0.1 are treated as already authenticated;
   * every other connection is prompted for the password and starts with a
   * fresh failed-attempt counter.
   *
   * @param resource $socket   listening socket
   * @param null     $null_one passed through to accept()
   * @param null     $null_two passed through to accept()
   * @return void
   */
  public function onAccept($socket, $null_one = null, $null_two = null)
  {
      $fd = $this->accept($socket, $null_one , $null_two);
      if(!$fd)
      {
          return;
      }

      $this->currentDealFd = (int)$fd;
      if($this->getRemoteIp() !== '127.0.0.1')
      {
          // adminAuth is keyed by fd and fd numbers are reused. Without this
          // reset a remote client could inherit the state left behind by an
          // earlier connection on the same fd and skip the password check.
          $this->adminAuth[$this->currentDealFd] = 0;
          $this->sendToClient("Password\n");
          return;
      }

      // Local connections are trusted: mark as authenticated right away.
      $this->adminAuth[$this->currentDealFd] = time();
      $this->sendToClient("Hello admin\n");
  }
  /**
   * Decide whether the data received so far forms a complete packet.
   *
   * This implementation does not inspect the buffer and always answers 0.
   * NOTE(review): presumably 0 tells the framework that the packet is
   * complete - confirm against Worker::dealInput().
   *
   * @see Worker::dealInput()
   * @param string $recv_buffer data received so far (unused)
   * @return int always 0
   */
  public function dealInput($recv_buffer)
  {
      $bytes_still_expected = 0;

      return $bytes_still_expected;
  }
  /**
   * Handle one line received from a connected admin (telnet) client.
   *
   * Flow:
   *  1. A connection that is not authenticated yet must send the password
   *     configured under "<workerName>.password". Three wrong attempts close
   *     the connection. The line carrying the password is never executed as
   *     a command.
   *  2. An authenticated connection may send: status, stop, reload, quit or
   *     "kill <pid>". An empty line is ignored, anything else prints the help.
   *
   * $this->adminAuth[fd] holds the failed-attempt counter while the client is
   * unauthenticated and the login timestamp (time()) once it is authenticated.
   *
   * @see Worker::dealProcess()
   * @param string $buffer data received from the client
   * @return void
   */
  public function dealProcess($buffer)
  {
      $buffer = trim($buffer);
      $ip = $this->getRemoteIp();
      if($ip != '127.0.0.1' && $buffer == 'status')
      {
          \Man\Core\Lib\Log::add("IP:$ip $buffer");
      }

      $fd = $this->currentDealFd;
      if(!isset($this->adminAuth[$fd]))
      {
          $this->adminAuth[$fd] = 0;
      }

      // Not authenticated yet: the line is a password attempt.
      if($this->adminAuth[$fd] < 3)
      {
          $password = \Man\Core\Lib\Config::get($this->workerName.'.password');
          // Strict comparison, and an unset/empty password never matches.
          // The former loose "!=" accepted an empty line when no password was
          // configured ('' == null) and compared numeric-looking strings as numbers.
          $password_ok = is_scalar($password)
              && (string)$password !== ''
              && $buffer === (string)$password;

          if($password_ok)
          {
              $this->adminAuth[$fd] = time();
              $this->sendToClient("Hello Admin \n");
              return;
          }
          if(++$this->adminAuth[$fd] >= 3)
          {
              $this->sendToClient("Password Incorrect \n");
              $this->closeClient($fd);
              // Drop the state so that a later connection reusing this fd
              // does not find a value >= 3 and get treated as authenticated.
              unset($this->adminAuth[$fd]);
              return;
          }
          $this->sendToClient("Please Try Again\n");
          return;
      }

      // Stop a single worker process. The pattern is anchored so that only
      // an exact "kill <pid>" line is accepted.
      if(preg_match('/^kill\s+(\d+)$/', $buffer, $match))
      {
          $pid = (int)$match[1];
          $this->sendToClient("Kill Pid $pid\n");
          // pid 0 would signal the whole process group of the monitor.
          if($pid <= 0 || !posix_kill($pid, SIGHUP))
          {
              $this->sendToClient("Pid Not Exsits\n");
          }
          return;
      }

      switch($buffer)
      {
          // Show statistics
          case 'status':
              $this->sendStatusReport();
              break;

          // Stop the server
          case 'stop':
              $master_pid = $this->readMasterPid();
              if($master_pid > 0)
              {
                  $this->sendToClient("stoping....\n");
                  posix_kill($master_pid, SIGINT);
              }
              else
              {
                  $this->sendToClient("Can not get master pid\n");
              }
              break;

          // Graceful restart: SIGHUP every worker except this process, or the
          // master when no other worker is known.
          case 'reload':
              $pid_worker_name_map = $this->getPidWorkerMap();
              unset($pid_worker_name_map[posix_getpid()]);
              if($pid_worker_name_map)
              {
                  foreach($pid_worker_name_map as $pid=>$item)
                  {
                      posix_kill($pid, SIGHUP);
                  }
                  $this->sendToClient("Restart Workers\n");
                  break;
              }
              $master_pid = $this->readMasterPid();
              if($master_pid > 0)
              {
                  posix_kill($master_pid, SIGHUP);
                  $this->sendToClient("Restart Workers\n");
              }
              else
              {
                  $this->sendToClient("Can not get master pid\n");
              }
              break;

          // Admin leaves
          case 'quit':
              $this->sendToClient("Admin Quit\n");
              $this->closeClient($fd);
              unset($this->adminAuth[$fd]);
              break;

          case '':
              break;

          default:
              $this->sendToClient("Unkonw CMD \nAvailable CMD:\n status show server status\n stop stop server\n reload graceful restart server\n quit quit and close connection\n kill pid kill the worker process of the pid\n");
      }
  }

  /**
   * Read the master pid from WORKERMAN_PID_FILE.
   *
   * @return int the pid, or 0 when the file is missing, unreadable or empty
   */
  private function readMasterPid()
  {
      if(!is_file(WORKERMAN_PID_FILE))
      {
          return 0;
      }
      // file_get_contents() may return false; the cast turns that into ''.
      return (int)trim((string)file_get_contents(WORKERMAN_PID_FILE));
  }

  /**
   * Send the "status" report to the current client: global information,
   * per-worker exit statistics and one row per worker process.
   *
   * The per-process rows are requested by sending SIGUSR1 to every known
   * process and collected through getStatusFromQueue() for roughly one second.
   *
   * @return void
   */
  private function sendStatusReport()
  {
      $status = $this->getMasterStatus();
      if(empty($status))
      {
          $this->sendToClient("Can not get Master status, Extension sysvshm or sysvmsg may not enabled\n");
          return;
      }

      $worker_pids = $this->getWorkerPidMap();
      $pid_worker_name_map = $this->getPidWorkerMap();

      // Widen the name/address columns to fit the longest value.
      foreach($worker_pids as $worker_name=>$pid_array)
      {
          $this->maxWorkerNameLength = max($this->maxWorkerNameLength, strlen($worker_name));
      }
      foreach(\Man\Core\Lib\Config::getAllWorkers() as $worker_name=>$config)
      {
          if(!isset($config['listen']))
          {
              continue;
          }
          $this->maxAddressLength = max($this->maxAddressLength, strlen($config['listen']));
      }

      // Read and discard stale status messages left in the queue.
      $msg_type = $message = 0;
      if(\Man\Core\Master::getQueueId())
      {
          while(@msg_receive(\Man\Core\Master::getQueueId(), self::MSG_TYPE_STATUS, $msg_type, 1000, $message, true, MSG_IPC_NOWAIT))
          {
          }
      }

      // sys_getloadavg() returns false on failure; implode() needs an array.
      $loadavg = sys_getloadavg();
      if(!is_array($loadavg))
      {
          $loadavg = array();
      }
      $uptime = time() - $status['start_time'];

      $this->sendToClient("---------------------------------------GLOBAL STATUS--------------------------------------------\n");
      $this->sendToClient(\Man\Core\Master::NAME.' version:' . \Man\Core\Master::VERSION . "\n");
      $this->sendToClient('start time:'. date('Y-m-d H:i:s', $status['start_time']).' run ' . floor($uptime/(24*60*60)). ' days ' . floor(($uptime%(24*60*60))/(60*60)) . " hours \n");
      $this->sendToClient('load average: ' . implode(", ", $loadavg) . "\n");
      $this->sendToClient(count($this->connections) . ' users ' . count($worker_pids) . ' workers ' . count($pid_worker_name_map)." processes\n");
      $this->sendToClient(str_pad('worker_name', $this->maxWorkerNameLength) . " exit_status exit_count\n");
      foreach($worker_pids as $worker_name=>$pid_array)
      {
          if(isset($status['worker_exit_code'][$worker_name]))
          {
              foreach($status['worker_exit_code'][$worker_name] as $exit_status=>$exit_count)
              {
                  $this->sendToClient(str_pad($worker_name, $this->maxWorkerNameLength) . " " . str_pad($exit_status, 16). " $exit_count\n");
              }
          }
          else
          {
              $this->sendToClient(str_pad($worker_name, $this->maxWorkerNameLength) . " " . str_pad(0, 16). " 0\n");
          }
      }

      $this->sendToClient("---------------------------------------PROCESS STATUS-------------------------------------------\n");
      $this->sendToClient("pid\tmemory ".str_pad(' listening', $this->maxAddressLength)." timestamp ".str_pad('worker_name', $this->maxWorkerNameLength)." ".str_pad('total_request', 13)." ".str_pad('packet_err', 10)." ".str_pad('thunder_herd', 12)." ".str_pad('client_close', 12)." ".str_pad('send_fail', 9)." ".str_pad('throw_exception', 15)." suc/total\n");

      // Without a message queue there is no way to collect per-process rows.
      if(!\Man\Core\Master::getQueueId())
      {
          return;
      }

      $time_start = time();
      unset($pid_worker_name_map[posix_getpid()]);
      $total_worker_count = count($pid_worker_name_map);

      // Ask every process for its status and pick up answers as they arrive.
      foreach($pid_worker_name_map as $pid=>$worker_name)
      {
          posix_kill($pid, SIGUSR1);
          if($this->getStatusFromQueue())
          {
              $total_worker_count--;
          }
      }

      // Keep polling for the remaining answers, but give up after about a second.
      while($total_worker_count > 0)
      {
          if($this->getStatusFromQueue())
          {
              $total_worker_count--;
          }
          if(time() - $time_start > 1)
          {
              break;
          }
      }
  }
  /**
   * Try to read one worker status message from the master message queue
   * (non-blocking) and send it to the current client as one table row.
   *
   * A message that is not an array, or that lacks 'pid' / 'worker_name', is
   * dumped with var_export() instead of being formatted. Missing counters
   * are shown as 0.
   *
   * @return bool true when a message was taken from the queue, false when none was available
   */
  protected function getStatusFromQueue()
  {
      $msg_type = 0;
      $message = null;
      if(!@msg_receive(\Man\Core\Master::getQueueId(), self::MSG_TYPE_STATUS, $msg_type, 10000, $message, true, MSG_IPC_NOWAIT))
      {
          return false;
      }

      // Validate before touching any key. The old code indexed $message first
      // and only then tested it, which made the fallback branch useless.
      if(!is_array($message) || !isset($message['pid'], $message['worker_name']))
      {
          $this->sendToClient(var_export($message, true)."\n");
          return true;
      }

      $message += array(
          'memory'          => 0,
          'start_time'      => 0,
          'total_request'   => 0,
          'packet_err'      => 0,
          'thunder_herd'    => 0,
          'client_close'    => 0,
          'send_fail'       => 0,
          'throw_exception' => 0,
      );

      $pid = $message['pid'];
      $worker_name = $message['worker_name'];
      $address = \Man\Core\Lib\Config::get($worker_name . '.listen');
      if(!$address)
      {
          $address = 'none';
      }

      // Share of requests without packet or send errors, as a percentage;
      // reported as 100 when no request has been handled yet.
      if($message['total_request'] == 0)
      {
          $success_rate = 100;
      }
      else
      {
          $failed = $message['packet_err'] + $message['send_fail'];
          $success_rate = round(($message['total_request'] - $failed) / $message['total_request'], 6) * 100;
      }

      $str = "$pid\t".str_pad(round($message['memory']/(1024*1024),2)."M", 7)." " .str_pad($address,$this->maxAddressLength) ." ". $message['start_time'] ." ".str_pad($worker_name, $this->maxWorkerNameLength)." ";
      $str .= str_pad($message['total_request'], 14)." ".str_pad($message['packet_err'],10)." ".str_pad($message['thunder_herd'],12)." ".str_pad($message['client_close'], 12)." ".str_pad($message['send_fail'],9)." ".str_pad($message['throw_exception'],15)." ".$success_rate."%";

      $this->sendToClient($str."\n");
      return true;
  }
  /**
   * Delete dated log directories that are at least CLEAR_BEFORE_DAYS days old.
   *
   * Only sub-directories of $dir whose name matches "20*-*-*" and can be
   * parsed by strtotime() are considered.
   *
   * @param string $dir log root directory
   * @return void
   */
  public function clearLogs($dir)
  {
      // glob() returns false on error; never feed that to foreach.
      $candidates = glob($dir."/20*-*-*");
      if(empty($candidates))
      {
          return;
      }

      $time_now = time();
      $max_age = self::CLEAR_BEFORE_DAYS * self::SECONDS_ONE_DAY;
      foreach($candidates as $file)
      {
          if(!is_dir($file))
          {
              continue;
          }
          // The directory name is expected to be a date such as 2014-01-31.
          $log_time = strtotime(basename($file));
          if($log_time === false)
          {
              continue;
          }
          if($time_now - $log_time >= $max_age)
          {
              $this->recursiveDelete($file);
          }
      }
  }
  /**
   * Check whether the master process still exists and trigger
   * onMasterDead() when it does not.
   *
   * @return void
   */
  public function checkMasterProcess()
  {
      $master_pid = (int)\Man\Core\Master::getMasterPid();
      // With pid 0 posix_kill() would probe our own process group, which
      // always succeeds; an unknown pid proves nothing, so do not alert.
      if($master_pid <= 0)
      {
          return;
      }

      // Signal 0 performs the existence/permission check without sending anything.
      if(posix_kill($master_pid, 0))
      {
          return;
      }

      // EPERM (errno 1) means the process exists but we may not signal it,
      // e.g. the master runs as a different user. That is not a dead master.
      $errno_eperm = 1;
      if(posix_get_last_error() === $errno_eperm)
      {
          return;
      }

      $this->onMasterDead();
  }
  /**
   * Invoked when the master process is found to be gone.
   *
   * Sends an alert through sendSms(), at most once per
   * WARING_SEND_TIME_LONG seconds.
   *
   * @return void
   */
  protected function onMasterDead()
  {
      $now = time();
      $elapsed = $now - self::$lastWarningTimeMap[self::WARNING_MASTER_DEAD];

      // Throttle: only alert when the previous alert is old enough.
      if($elapsed >= self::WARING_SEND_TIME_LONG)
      {
          // Delay the alert. When the start script kills the master this
          // process is killed as well, so no alert goes out in that case.
          sleep(5);

          $this->sendSms('告警消息 WorkerMan框架监控 ip:'.$this->getIp().' 主进程意外退出');

          // Remember when this alert was raised (time taken before the delay).
          self::$lastWarningTimeMap[self::WARNING_MASTER_DEAD] = $now;
      }
  }
  /**
   * Compare the master's worker-exit statistics with the previous snapshot
   * and report workers whose exit count grew too much since then.
   *
   * The threshold is read from "<workerName>.max_worker_exit_count" and
   * falls back to 2000 when it is missing or not positive. The first run
   * only stores the snapshot.
   *
   * @return void
   */
  public function checkMasterStatus()
  {
      $master_status = $this->getMasterStatus();
      if(empty($master_status))
      {
          $this->notice("can not get master status" , false);
          return;
      }

      $exit_stats = $master_status['worker_exit_code'];

      // Nothing to compare against on the first run.
      if(null === $this->lastMasterStatus)
      {
          $this->lastMasterStatus = $exit_stats;
          return;
      }

      $threshold = (int)\Man\Core\Lib\Config::get($this->workerName.".max_worker_exit_count");
      $threshold = $threshold > 0 ? $threshold : 2000;

      foreach($exit_stats as $name => $count_by_code)
      {
          foreach($count_by_code as $exit_code => $total)
          {
              $baseline = 0;
              if(isset($this->lastMasterStatus[$name][$exit_code]))
              {
                  $baseline = $this->lastMasterStatus[$name][$exit_code];
              }

              $growth = $total - $baseline;
              if($growth < $threshold)
              {
                  continue;
              }
              $this->onTooManyWorkersExits($name, $exit_code, $growth);
          }
      }

      // Current statistics become the baseline of the next run.
      $this->lastMasterStatus = $exit_stats;
  }
  /**
   * Run the memory check for every process listed by getPidWorkerMap().
   *
   * @return void
   */
  public function checkMemUsage()
  {
      $workers_by_pid = $this->getPidWorkerMap();

      foreach($workers_by_pid as $process_id => $name)
      {
          $this->checkWorkerMemByPid($process_id, $name);
      }
  }
  /**
   * Check the resident memory of one process and restart it when it is over the limit.
   *
   * The limit comes from "<workerName>.max_mem_limit" (DEFAULT_MEM_LIMIT
   * when that is empty) and is compared with the VmRSS value read from
   * /proc/<pid>/status. A process at or above the limit receives SIGHUP
   * and a notice is written.
   *
   * @param int    $pid         process id to inspect
   * @param string $worker_name worker name, used in the notice text only
   * @return void
   */
  protected function checkWorkerMemByPid($pid, $worker_name)
  {
      $limit = \Man\Core\Lib\Config::get($this->workerName.'.max_mem_limit') ?: self::DEFAULT_MEM_LIMIT;

      // Per-process statistics kept by the system
      $proc_status_path = "/proc/$pid/status";
      if(!is_file($proc_status_path))
      {
          return;
      }

      $proc_status = file_get_contents($proc_status_path);
      if(empty($proc_status))
      {
          return;
      }

      // Only the resident set size is of interest here
      $fields = array();
      if(!preg_match('/VmRSS:\s+(\d+)\s+([a-zA-Z]+)/', $proc_status, $fields))
      {
          return;
      }

      $rss = $fields[1];
      if(!($rss >= $limit))
      {
          return;
      }

      posix_kill($pid, SIGHUP);
      $this->notice("worker:$worker_name pid:$pid memory exceeds the maximum $rss>=$limit");
  }
  /**
   * Invoked when a worker's exit count grew beyond the configured threshold.
   *
   * Sends an alert through sendSms(), at most once per
   * WARING_SEND_TIME_LONG seconds. Exit statuses 65280 and 30720 are
   * reported as FatalError; any other status is reported as frequent exits.
   *
   * @param string $worker_name name of the affected worker
   * @param int    $status      exit status the count belongs to
   * @param int    $exit_count  number of new exits since the previous check
   * @return void
   */
  public function onTooManyWorkersExits($worker_name, $status, $exit_count)
  {
      $now = time();
      $last_sent = self::$lastWarningTimeMap[self::WARNING_TOO_MANY_WORKERS_EXIT];

      // Throttle: skip when the previous alert is too recent.
      if($now - $last_sent < self::WARING_SEND_TIME_LONG)
      {
          return;
      }

      $ip = $this->getIp();
      $prefix = '告警消息 Workerman框架监控 '.$ip.' '.$worker_name;

      if(in_array($status, array(65280, 30720)))
      {
          $sms = $prefix.'5分钟内出现 FatalError '.$exit_count.'次 时间:'.date('Y-m-d H:i:s');
      }
      else
      {
          $sms = $prefix.' 进程频繁退出 退出次数'.$exit_count.' 退出状态码:'.$status .' 时间:'.date('Y-m-d H:i:s');
      }
      $this->sendSms($sms);

      // Remember when this alert was raised.
      self::$lastWarningTimeMap[self::WARNING_TOO_MANY_WORKERS_EXIT] = $now;
  }
  517. /**
  518. * Send an alert text message (SMS).
  519. * Stub: the body is empty, so nothing is sent by this implementation.
  520. * @param string $content alert text
  521. * @return void
  522. */
  523. protected function sendSms($content)
  524. {
  525. // SMS alerting is not implemented here
  526. }
  /**
   * Determine an IP address that identifies this machine in alert messages.
   *
   * Candidates are tried in this order; empty values, 0.0.0.0 and loopback
   * are skipped:
   *  1. getLocalIp()
   *  2. the "<worker_name>.ip" config entry (only when $worker_name is given)
   *  3. the first non-loopback IPv4 address printed by `ifconfig`
   *
   * When nothing usable is found the value of getLocalIp() is returned as is.
   *
   * @param string $worker_name optional worker whose configured ip may be used
   * @return string
   */
  public function getIp($worker_name = '')
  {
      $unusable = array('0.0.0.0', '127.0.0.1');

      $ip = $this->getLocalIp();
      // The old condition used "$ip = '127.0.0.1'" (assignment), which was
      // always true and overwrote a perfectly valid address.
      if(!empty($ip) && !in_array($ip, $unusable, true))
      {
          return $ip;
      }

      if($worker_name)
      {
          $configured_ip = \Man\Core\Lib\Config::get($worker_name . '.ip');
          if(!empty($configured_ip) && !in_array($configured_ip, $unusable, true))
          {
              return $configured_ip;
          }
      }

      // Last resort: parse ifconfig. Both "inet addr:1.2.3.4" (older
      // net-tools) and "inet 1.2.3.4" output formats are accepted.
      $ifconfig_output = (string)shell_exec('ifconfig');
      $matches = array();
      if(preg_match_all('/inet (?:addr:)?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/', $ifconfig_output, $matches))
      {
          foreach($matches[1] as $candidate)
          {
              if(in_array($candidate, $unusable, true) || strpos($candidate, '127.') === 0)
              {
                  continue;
              }
              return $candidate;
          }
      }

      return $ip;
  }
  /**
   * Delete a file, or a directory together with everything inside it.
   *
   * Symbolic links are removed themselves and never followed, so nothing
   * outside $path is touched. Hidden entries (dotfiles) are removed too.
   *
   * @param string $path file or directory to remove
   * @return bool true when $path and all of its content were removed
   */
  private function recursiveDelete($path)
  {
      // Check is_link() first: a link pointing to a directory must be
      // unlinked, not descended into.
      if(is_link($path) || is_file($path))
      {
          return unlink($path);
      }
      if(!is_dir($path))
      {
          return false;
      }

      // scandir() also lists dotfiles, which glob($path.'/*') skipped and
      // which then made rmdir() fail on a non-empty directory.
      $entries = scandir($path);
      if($entries === false)
      {
          return false;
      }

      $all_removed = true;
      foreach($entries as $entry)
      {
          if($entry === '.' || $entry === '..')
          {
              continue;
          }
          if(!$this->recursiveDelete($path.'/'.$entry))
          {
              $all_removed = false;
          }
      }

      // rmdir() can only succeed on an empty directory.
      return $all_removed && rmdir($path);
  }
  560. }