/* ******************************************************************************* * Copyright (c) 1997 Martin Poole * ******************************************************************************* ** ** WARNING !! WARNING !! WARNING !! WARNING !! WARNING !! WARNING !! ** ** Any changes to be made to this file should first be checked with ** mplib1 source control for library integrity. ** ** mplib1 source control can be reached at mplib1@quatermass.co.uk ** * * $Source$ * $Author$ * $Date$ * $Revision$ * Purpose : Generic Program Watchdog * ******************************************************************************* * * Change History * * $Log$ * ******************************************************************************* */ #ident "$Header$" /* ------------------------------------------------------------------ Include files ------------------------------------------------------------------ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "include/bpo_init_internal.h" #include "include/bpo_q_internal.h" #include "include/bpo_proc_internal.h" #include #include "watchdog.h" /* ------------------------------------------------------------------ structures / defines ------------------------------------------------------------------ */ /* ------------------------------------------------------------------ static data ------------------------------------------------------------------ */ static unsigned long proc_ser_nbr=0UL; static int track_debug=0; /* ------------------------------------------------------------------ Code starts here ------------------------------------------------------------------ */ static void free_all_running( struct program_hdr *pptr ) { dl_Walk_List( &pptr->running_pids, (dl_Walk_List_t)free, NULL ); dl_Init_List( &pptr->running_pids, 0 ); return; } struct pid_track * alloc_pid_track( struct program_hdr *pptr, int exp_time ) { struct pid_track *ptp; ptp = malloc( sizeof(struct pid_track) ); if (ptp) { ptp->the_pid = (pid_t)0; ptp->pid_str[0]='\0'; ptp->starting=1; ptp->dying=0; ptp->track_by_pid=0; ptp->pptr = pptr; (void)time( &ptp->expire ); exp_time = get_default_int( exp_time, "START_HYSTERESIS", 15 ); ptp->expire+=(time_t)exp_time; dl_Init_Node( &ptp->pid_node, ptp->pid_str, ptp ); dl_Add_Tail( &pptr->running_pids, &ptp->pid_node ); } return(ptp); } static int started( struct pid_track *ptp, struct pid_track **ptpp ) { int rv=0; if (ptp->starting) { /* This is it */ *ptpp = ptp; rv=1; } return(rv); } static struct pid_track * find_started_pid( struct program_hdr *pptr ) { struct pid_track *ptp=NULL; dl_Walk_List2( &pptr->running_pids, (dl_Walk_List2_t)started, &ptp ); return(ptp); } static int pid_has_started( struct program_hdr *pptr, pid_t the_pid, char *pid_str, int kill_state ) { struct pid_track *ptp; ptp = dl_Find_Item_By_Name( &pptr->running_pids, pid_str ); if (ptp==NULL) { /* find a started one, or create one */ ptp = find_started_pid( pptr ); if (ptp==NULL) ptp = alloc_pid_track( pptr, 0 ); if (ptp) { ptp->starting=0; ptp->the_pid = the_pid; strcpy( ptp->pid_str, pid_str ); ptp->dying = kill_state; fprintfile(stderr, "pid_has_started: %s pid %s\n", pptr->prg_name, pid_str ); }else { fprintfile(stderr, "pid_has_started: Unable to allocate pid_track for %s pid %s\n", pptr->prg_name, pid_str ); } } pptr->spawn_cnt=0; return(ptp!=NULL); } static int proc_list( struct shm_process_private *sppp, void *param1 ) { int ks; struct program_hdr *pptr; ks = is_pid_dead(sppp->pid); /* Now search for it, and update the relevant structure */ pptr=find_program_hdr( sppp->pid_nm ); if (pptr) { if (track_debug || ks) fprintfile(stderr, "Process %s pid:%d is %s\n", sppp->pid_nm, sppp->pid, ((ks)?"dead":"alive") ); pid_has_started( pptr, sppp->pid, sppp->pid_str, ks ); } else if ((track_debug || ks) && sppp->pid!=getpid()) { fprintfile(stderr, "Untracked process %d is %s\n", sppp->pid, ((ks)?"dead":"alive") ); } if (ks) return(SHM_DELETE_PROC); return(SHM_LEAVE_PROC); } static void pid_file_checks( struct program_hdr *pptr, void *vp ) { struct stat tstat; FILE *fh; char tbuf[100]; char *real_name; pid_t the_pid; int kill_state; if (pptr->pid_name && *pptr->pid_name) { real_name = eval_config_default( NULL, pptr->pid_name ); /* check timestamp */ if (stat(real_name,&tstat)==0) { /* file exists and can be checked */ if (tstat.st_mtime > pptr->pid_f_time) { pptr->pid_f_time = tstat.st_mtime; fh = fopen( real_name, "r" ); free_all_running( pptr ); if (fh) { fgetline(fh, tbuf, 100 ); the_pid = (pid_t)strtol(tbuf,NULL,0); gen_pid_str( tbuf, the_pid ); kill_state = kill( the_pid, 0 ); pid_has_started( pptr, the_pid, tbuf, kill_state ); fclose(fh); } } } } return; } static int up_spawn_cnt( struct program_hdr *pptr, time_t now ) { int rv=0,sl; if (now < pptr->spawn_window) { pptr->spawn_cnt++; } sl = get_default_int( 0, "SPAWN_LIMIT", 5 ); if (pptr->spawn_cnt>=sl) { sl = get_default_int( 0, "SPAWN_HOLD", 120 ); pptr->spawn_time = now + (time_t)sl; fprintfile( stderr, "Program %s restarting too often.\n", pptr->prg_name ); fprintfile( stderr, "Holding off for %d seconds\n", sl ); }else { /* Ok to start */ sl = get_default_int( 0, "SPAWN_WINDOW", 25 ); pptr->spawn_window = now + (time_t)sl; rv=1; } return(rv); } static void count_running( struct pid_track *ptp, int *rv ) { time_t now; int free_this=0; if (ptp->starting) { (void)time( &now ); if (now > ptp->expire) { fprintfile( stderr, "Program: %s has not started\n", ptp->pptr->prg_name ); free_this=1; }else { *rv = (*rv)+1; } }else if (!ptp->dying || (ptp->dying && ptp->pptr->exclusive)) { if ( is_pid_dead( ptp->the_pid ) ) { free_this=1; }else { *rv = (*rv)+1; } } if (free_this) { /* This has not required, so remove it */ dl_Remove_Node( &ptp->pid_node ); free( ptp ); } return; } static void kill_non_dying( struct pid_track *ptp, struct program_hdr *pptr ) { /* fprintfile( stderr, "kill_non_dying: %p %p (%ld/%s)\n",*/ /* ptp, pptr, ptp->the_pid, ptp->pid_str );*/ if (ptp->starting==0 && pptr->param1>0) { pptr->param1--; kill_pid( ptp->the_pid, pptr->param2, stderr ); ptp->dying=1; } return; } int kill_current_instances( struct program_hdr *pptr ) { /* Kill all non-dying */ pptr->param1 = 9999999; /* lots */ pptr->param2 = (pptr->pid_name)?0:1; dl_Walk_List( &pptr->running_pids, (dl_Walk_List_t)kill_non_dying, (void *)pptr ); return(0); } static int kill_some_program( struct program_hdr *pptr, int how_many ) { pptr->param1 = how_many; pptr->param2 = (pptr->pid_name)?0:1; dl_Walk_List( &pptr->running_pids, (dl_Walk_List_t)kill_non_dying, (void *)pptr ); return(0); } static void enough_programs( struct program_hdr *pptr, void *vp ) { int rv=0; if (pptr->active==0 || pptr->lvl_exclude) { /* Kill all non-dying */ kill_current_instances( pptr ); }else if (pptr->ptype==PRG_HDR_DAEMON) { dl_Walk_List( &pptr->running_pids, (dl_Walk_List_t)count_running, &rv ); if (rvcopies) { int maybe; time_t now; /* is backoff set ? */ (void)time( &now ); if (pptr->spawn_time!=(time_t)0) { if (now > pptr->spawn_time) { pptr->spawn_time = (time_t)0; pptr->spawn_cnt=0; maybe = up_spawn_cnt( pptr, now ); }else maybe=0; }else maybe = up_spawn_cnt( pptr, now ); if (maybe) start_some_program( pptr, pptr->copies-rv ); }else if (rv>pptr->copies) { kill_some_program( pptr, rv -pptr->copies ); } }else if (pptr->ptype==PRG_HDR_BATCH) { dl_Walk_List( &pptr->running_pids, (dl_Walk_List_t)count_running, &rv ); if (rv==0) { /* completed, set to inactive */ pptr->active = 0; } } return; } static void update_proc_list( const void *hint ) { struct cache_proc_list *cplp; if (hint) { cplp = Get_Cache_Procs( hint ); if (cplp && cplp->serial_nbr > proc_ser_nbr) { proc_ser_nbr = cplp->serial_nbr; Validate_Processes( hint, SHM_CHECK_ALL_PROCESSES, proc_list, NULL ); } } return; } static void report_deaths( struct pid_track *ptp, struct program_hdr *pptr ) { if ( !ptp->dying && !ptp->starting && is_pid_dead( ptp->the_pid ) ) { if (pptr->ptype==PRG_HDR_DAEMON) fprintfile( stderr, "Program: %s (%s) has unexpectedly died\n", pptr->prg_name, ptp->pid_str ); else fprintfile( stderr, "Program: %s (%s) has completed\n", pptr->prg_name, ptp->pid_str ); /* Looks like it */ dl_Remove_Node( &ptp->pid_node ); free( ptp ); } return; } static void body_count( struct program_hdr *pptr, void *vp ) { dl_Walk_List( &pptr->running_pids, (dl_Walk_List_t)report_deaths, pptr ); return; } int prog_track( const void *hint ) { /* So, in here we walk the proc list (if it has changed) for any program names that match those in our run list. If one does, we check for its presence in the running pid list, if it exists but has died we remove it, if it does not exist we search for a started running pid structure and add that, or create a suitable one. */ track_debug=get_config_flag("WATCH_TRACK_DEBUG"); update_proc_list( hint ); /* Having updated the running pids (from shm), we now check the pid_File varients */ walk_all_programs( pid_file_checks, NULL ); /* Now we check for bodies */ walk_all_programs( body_count, NULL ); /* Having updated all running programs in the lists we now check that enough programs are running */ walk_all_programs( enough_programs, NULL ); return(1); } static void count_starting( struct pid_track *ptp, int *rv ) { if (ptp->starting) { *rv = (*rv)+1; } return; } static void count_this_start( struct program_hdr *pptr, void *rv ) { dl_Walk_List( &pptr->running_pids, (dl_Walk_List_t)count_starting, rv ); return; } int count_all_starting( const void *hint ) { int rv=0; update_proc_list( hint ); walk_all_programs( count_this_start, &rv ); return(rv); } /* -- End of File -- */