/* ******************************************************************************* * Copyright (c) 1997 Martin Poole * ******************************************************************************* ** ** WARNING !! WARNING !! WARNING !! WARNING !! WARNING !! WARNING !! ** ** Any changes to be made to this file should first be checked with ** mplib1 source control for library integrity. ** ** mplib1 source control can be reached at mplib1@quatermass.co.uk ** * * System : * Subsystem : * Module : * $Source$ * $Author$ * $Date$ * $Revision$ * Purpose : * ******************************************************************************* * * Change History * * $Log$ * ******************************************************************************* */ #ident "$Header$" /* ------------------------------------------------------------------ Include files ------------------------------------------------------------------ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "watchdog.h" /* ------------------------------------------------------------------ structures / defines ------------------------------------------------------------------ */ enum comm_actions { CA_SHUTDOWN = 1, CA_INCLUDE, CA_EXCLUDE, CA_CREATE, CA_UPDATE, CA_FORGET, CA_DELETE, CA_KILL, CA_LIST, CA_TICKLE, CA_WAIT, CA_DUMP, CA_LOAD, CA_SAVE, CA_SLEEP, CA_SIGNAL, CA_RESTART, CA_RUN, CA_BOUNCE, CA_CLEANUP, CA_DEBUG, }; enum wait_actions { WA_KILL = 1, WA_START, WA_TILL, WA_BATCH, }; struct wait_job { dl_Node_t wj_Node; int wait_for; time_t wait_till; struct watchdog_ctrl_msg *cmp; struct program_hdr *pptr; }; /* ------------------------------------------------------------------ static data ------------------------------------------------------------------ */ static char ctrl_q_name[]=WATCH_CTRL_Q; static struct match_token ctrl_comms[] = { { CA_SHUTDOWN, "^shutdown please" }, { CA_INCLUDE, "^include" }, { CA_EXCLUDE, "^exclude" }, { CA_CREATE, "^create" }, { CA_UPDATE, "^update" }, { CA_FORGET, "^forget" }, { CA_DELETE, "^delete" }, { CA_KILL, "^kill" }, { CA_LIST, "^list" }, { CA_TICKLE, "^tickle" }, { CA_WAIT, "^wait" }, { CA_DUMP, "^dump" }, { CA_LOAD, "^load" }, { CA_SLEEP, "^sleep" }, { CA_SIGNAL, "^signal" }, { CA_RESTART, "^restart" }, { CA_RUN, "^run" }, { CA_BOUNCE, "^bounce" }, { CA_CLEANUP, "^cleanup" }, { CA_DEBUG, "^debug" }, { 0, NULL } }; static struct match_token wait_comms[] = { { WA_KILL, "^kill" }, { WA_START, "^start" }, { WA_TILL, "^till" }, { WA_BATCH, "^batch" }, { 0, NULL } }; static struct match_token signal_comms[] = { { SIGHUP, "^HUP" }, { SIGHUP, "^hup" }, { SIGSEGV, "^SEGV" }, { SIGSEGV, "^segv" }, { SIGUSR1, "^USR1" }, { SIGUSR1, "^usr1" }, { SIGUSR2, "^USR2" }, { SIGUSR2, "^usr2" }, { 0, NULL } }; static int watch_ctrl_debug=0; static int watch_wait_debug=0; static int the_signal; static dl_List_t wait_list; static const char dt_tm_iso8601[]="%Y-%m-%d %H:%M:%SZ"; /* ------------------------------------------------------------------ Code starts here ------------------------------------------------------------------ */ static const char * signal_name( int the_signal ) { struct match_token *mptr; mptr = signal_comms; while( mptr->m_val && mptr->m_val!=the_signal) mptr++; return(mptr->m_ptr); } int create_watch_queues( const void *comms_hint ) { /* Do all those thing we're supposed to do */ if(mpCreateQ( comms_hint, ctrl_q_name, 0, Q_SOLE_READER, Q_NM_PIPE )==NULL) { fprintfile(stderr,"Unable to create control queue.\n" ); return(1); } dl_Init_List( &wait_list, 0 ); return(0); } static struct watchdog_ctrl_msg * send_back_control_msg( struct watchdog_ctrl_msg *cmp ) { char *parsep; if (cmp) { parsep = cmp->rep_buf + strlen(cmp->rep_buf); if (parsep>cmp->rep_buf) { parsep--; if (*parsep =='\n') *parsep = '\0'; else parsep=NULL; }else parsep = NULL; fprintfile(stderr,"Control: Reply: <%s>\n", cmp->rep_buf ); if (parsep) *parsep='\n'; if (mpReplyMsg( &cmp->ctrl_msg )>0) cmp = NULL; } return(cmp); } static void free_ctrl_msg( struct watchdog_ctrl_msg *cmp ) { if (cmp) { /* Remove it from the pid resource list */ bpo_Remove_Node( &cmp->ctrl_msg.Q_Node ); /* and free it */ shfree( cmp ); } } static void free_wait_job( struct wait_job *wjp ) { dl_Remove_Node( &wjp->wj_Node ); if (wjp->cmp) { free_ctrl_msg( wjp->cmp ); } free( wjp ); return; } int delete_watch_queues( const void *comms_hint ) { dl_Walk_List( &wait_list, (dl_Walk_List_t)free_wait_job, NULL ); return(0); } static void check_wait_job( struct wait_job *wjp ) { int done=0; time_t now; switch( wjp->wait_for ) { case WA_KILL : /* Wait For kill list to reach 0 entries */ if (check_kills( stderr )==0) { sprintf( wjp->cmp->rep_buf, "Kill list empty\n" ); done=1; }else if (watch_wait_debug) { fprintfile( stderr, "waiting for kill list to empty\n" ); } break; case WA_START : /* wait for starting entries to be purged, or filled */ if (count_all_starting( wjp->cmp )==0) { sprintf( wjp->cmp->rep_buf, "Job starting list empty\n" ); done=1; }else if (watch_wait_debug) { fprintfile( stderr, "waiting for start list to complete\n" ); } break; case WA_TILL : /* timed out yet ? */ (void)time( &now ); if (nowwait_till) { sprintf( wjp->cmp->rep_buf, "Wakey, Wakey!\n" ); done=1; }else if (watch_wait_debug) { fprintfile( stderr, "waiting for timeout\n" ); } break; case WA_BATCH : /* Has job completed */ if ( dl_Any_In_List( &wjp->pptr->running_pids )==0) { sprintf( wjp->cmp->rep_buf, "Program %s has completed\n", wjp->pptr->prg_name ); done=1; }else if (watch_wait_debug) { fprintfile( stderr, "waiting for job to finish (%s)\n", wjp->pptr->prg_name ); } break; default : sprintf( wjp->cmp->rep_buf, "Unrecognised wait command\n" ); done=1; break; } if (done) { wjp->cmp = send_back_control_msg( wjp->cmp ); free_wait_job( wjp ); } return; } static void check_all_wait_jobs( void ) { watch_wait_debug = get_config_flag("WATCH_WAIT_DEBUG"); /* Check each job on the wait list to see if it is complete */ dl_Walk_List( &wait_list, (dl_Walk_List_t)check_wait_job, NULL ); return; } static struct watchdog_ctrl_msg * add_wait_job( struct watchdog_ctrl_msg *cmp, int action, const char *details ) { struct wait_job *wjp; int add_this=1; wjp = malloc( sizeof( struct wait_job ) ); if (wjp) { dl_Init_Node( &wjp->wj_Node, NULL, wjp ); wjp->cmp = cmp; wjp->wait_for = action; switch( action ) { case WA_TILL : /* read how long */ (void)time( &wjp->wait_till ); wjp->wait_till += (time_t)atoi(details); break; case WA_BATCH : /* remember this program */ wjp->pptr = find_program_hdr( details ); if (wjp->pptr==NULL) add_this=0; break; } if (add_this) { dl_Add_Tail( &wait_list, &wjp->wj_Node ); check_wait_job( wjp ); cmp = NULL; } } return(cmp); } static int set_lvl_state( const char *lvl_nm, int active ) { struct level_hdr *lptr; lptr = find_level_hdr( lvl_nm ); if (lptr) { lptr->active=active; save_lvl_hdr( lptr ); set_all_program_levels(); } return(lptr!=NULL); } static int level_incl( const char *lvl_nm, char *cp, int db_save ) { if (watch_ctrl_debug) fprintfile(stderr,"level_incl: %s\n", lvl_nm ); return(set_lvl_state( lvl_nm, 0 )); } static int level_excl( const char *lvl_nm, char *cp, int db_save ) { if (watch_ctrl_debug) fprintfile(stderr,"level_excl: %s\n", lvl_nm ); return(set_lvl_state( lvl_nm, 1 )); } int exclude_named_level( const char *lvl_nm ) { if (watch_ctrl_debug) fprintfile(stderr,"exclude_named_level: %s\n", lvl_nm ); return(set_lvl_state( lvl_nm, 1 )); } static int set_prg_state( const char *prg_nm, int active ) { struct program_hdr *pptr; pptr = find_program_hdr( prg_nm ); if (pptr) { pptr->active=active; save_prg_hdr( pptr ); } return(pptr!=NULL); } static int program_incl( const char *prg_nm, char *cp, int db_save, int ptype ) { if (watch_ctrl_debug) fprintfile(stderr,"program_incl: %s\n", prg_nm ); return(set_prg_state( prg_nm, 1 )); } static int program_excl( const char *prg_nm, char *cp, int db_save, int ptype ) { if (watch_ctrl_debug) fprintfile(stderr,"program_excl: %s\n", prg_nm ); return(set_prg_state( prg_nm, 0 )); } static int signal_this( struct pid_track *ptp, int *the_signal ) { fprintfile( stderr, "sending %s to %s\n", signal_name(*the_signal), ptp->pid_str ); kill( ptp->the_pid, *the_signal ); return(0); } static void signal_pptr( struct program_hdr *pptr ) { if (pptr) dl_Walk_List( &pptr->running_pids, (dl_Walk_List_t)signal_this, (void *)&the_signal ); return; } static int program_signal( const char *prg_nm, char *cp, int db_save, int ptype ) { struct program_hdr *pptr; pptr = find_program_hdr( prg_nm ); signal_pptr( pptr ); return(pptr!=NULL); } static int level_signal( const char *lvl_nm, char *cp, int db_save ) { struct level_hdr *lptr; lptr = find_level_hdr( lvl_nm ); if (lptr) { walk_all_programs_in_level( lptr, (void(*)(struct program_hdr *,void *))signal_pptr, NULL ); } return(lptr!=NULL); } static int signal_things( int wait_signal, char *parsep ) { int rv; the_signal = wait_signal; rv = parse_this_line( parsep, level_signal, program_signal, 0 ); return(rv); } static int forget_level( const char *lvl_nm, char *active, int db_save ) { struct level_hdr *lptr; if (watch_ctrl_debug) fprintfile(stderr,"forget_level: %s\n", lvl_nm ); lptr = find_level_hdr( lvl_nm ); if (lptr) { if (delete_lvl_hdr(lptr)==0) free_level(lptr); } return(lptr!=NULL); } static int level_bounce( const char *lvl_nm, char *cp, int db_save ) { struct level_hdr *lptr; lptr = find_level_hdr( lvl_nm ); if (watch_ctrl_debug) fprintfile(stderr,"bounce level: %s (%p)\n", lvl_nm, lptr ); if (lptr) { walk_all_programs_in_level( lptr, (void(*)(struct program_hdr *pptr, void *))kill_current_instances, NULL ); } return(lptr!=NULL); } static int program_bounce( const char *prg_nm, char *active, int db_save, int ptype ) { struct program_hdr *pptr; pptr = find_program_hdr( prg_nm ); if (watch_ctrl_debug) fprintfile(stderr,"bounce program: %s\n", prg_nm, pptr ); if (pptr) { kill_current_instances( pptr ); } return(pptr!=NULL); } static int try_killing( char *parsep ) { int rv=0; pid_t kpid; while ( (kpid = (pid_t)strtol( parsep, &parsep, 10 )) && kpid!=getpid() ) { kill_pid( kpid, 1, stderr ); /* assume in shared */ rv++; } return(rv); } static int reply_control_msg( struct watchdog_ctrl_msg *cmp ) { cmp = send_back_control_msg( cmp ); if (cmp) { free_ctrl_msg( cmp ); } return(0); } int dump_db_to_file( const char *fname ) { FILE *fp; int rv=0; char tstr[50]; fp=fopen(fname,"w"); if (fp) { make_timestamp_str( tstr ); fprintf( fp, "#Watchdog state file dump - %s\n", tstr ); fprintf( fp, "#\n#Programs\n#\n" ); walk_all_programs( (void (*)(struct program_hdr *,void *))print_prg_hdr, (void *)fp ); fprintf( fp, "\n#\n#Levels\n#\n" ); walk_all_levels( (void (*)(struct level_hdr *,void *))print_lvl_hdr, fp ); fprintf( fp, "\n\n#\n#End Of File\n" ); fclose(fp); rv=1; } return(rv); } int read_file_into_db( const char *fname ) { int rv=0; char tbuf[500]; FILE *fh; if ( (fh=fopen(fname,"r")) ) { while(fgetline2(fh,tbuf,500)==0) { /* might mean something */ parse_watch_line( tbuf ); }; fclose(fh); rv=1; } return(rv); } static int forget_program( const char *prg_nm, char *active, int db_save, int ptype ) { struct program_hdr *pptr; if (watch_ctrl_debug) fprintfile(stderr,"forget_program: %s\n", prg_nm ); pptr = find_program_hdr( prg_nm ); if (pptr) { if (delete_prg_hdr(pptr)==0) free_program(pptr); } return(pptr!=NULL); } static int run_this_batch( const char *prg_name ) { struct program_hdr *pptr; int rv=0; pptr = find_program_hdr( prg_name ); if (pptr && pptr->ptype == PRG_HDR_BATCH && pptr->active==0) { pptr->active=1; start_some_program( pptr, pptr->copies ); rv=1; } return(rv); } static void pid_details( struct pid_track *ptp, FILE *fp ) { char pid_str[PID_STR_LEN]; char tbuf[40]; struct tm *tmp; gen_pid_str( pid_str, ptp->the_pid ); fprintf( fp, " pid: <%s> pids: <%s>\n", pid_str, ptp->pid_str ); tmp = gmtime( &ptp->expire ); strftime( tbuf, 40, dt_tm_iso8601, tmp ); fprintf( fp, " starting: %d dying: %d expire: %s t_by_pid: %d\n", ptp->starting, ptp->dying, tbuf, ptp->track_by_pid ); return; } static void print_prg_debug( struct program_hdr *pptr, FILE *fp ) { char tbuf[40]; struct tm *tmp; /* Now print all the details on this sucker */ fprintf( fp, "Program: <%s> (search: <%s>)\n", pptr->prg_name, pptr->key ); fprintf( fp, " Group: <%s> Exe: <%s> Opt: <%s>\n", pptr->grp_name, pptr->exe_name, pptr->opt_name ); fprintf( fp, " pid: <%s> env: <%s> out: <%s> err: <%s>\n", pptr->pid_name, pptr->env_name, pptr->out_name, pptr->err_name ); fprintf( fp, " ptype: %d level: %d copies: %d active: %d\n", pptr->ptype, pptr->level, pptr->copies, pptr->active ); fprintf( fp, " t_by_pid: %d s_up: %d sp_l: %d sp_h: %d sp_w: %d\n", pptr->track_by_pid, pptr->startup, pptr->sp_limit, pptr->sp_hold, pptr->sp_window ); tmp = gmtime( &pptr->pid_f_time ); strftime( tbuf, 40, dt_tm_iso8601, tmp ); fprintf( fp, " f_time: %s lvl_excl: %d spw_cnt: %d\n", tbuf, pptr->lvl_exclude, pptr->spawn_cnt ); tmp = gmtime( &pptr->spawn_time ); strftime( tbuf, 40, dt_tm_iso8601, tmp ); fprintf( fp, " spw_t: %s spw_w: %d\n", tbuf, (int)pptr->spawn_window ); dl_Walk_List( &pptr->running_pids, (dl_Walk_List_t)pid_details, fp ); fprintf( fp, "\n" ); return; } static void watchdog_debug( char *debug_line, char *reply ) { walk_all_programs( (void(*)(struct program_hdr *,void *))print_prg_debug, stderr ); strcpy( reply, "Debug performed" ); return; } static int process_line( char *parsep, level_func_t level_func, prog_func_t prog_func, int db_save, char *rbuf, const char *OK_str, const char *BAD_str, const char *info_str ) { int rv; rv = parse_this_line( parsep, level_func, prog_func, db_save ); if (rv) { sprintf( rbuf, "%s %s\n", OK_str, info_str ); }else { sprintf( rbuf, "%s %s\n", BAD_str, info_str ); } return(rv); } int watch_ctrl( const void *comms_hint ) { struct watchdog_ctrl_msg *cmp; char *parsep,*dupep; int rv=0,wl; watch_ctrl_debug = get_config_flag("WATCH_CTRL_DEBUG"); check_all_wait_jobs( ); while ( (cmp = mpGetMsg( ctrl_q_name, comms_hint )) ) { fprintfile(stderr,"Control: Message: <%s>\n", cmp->comm_buf ); wl = match_a_string( cmp->comm_buf, ctrl_comms, &parsep ); /* Now dupe the remaining string for error purposes */ dupep = strdup( parsep ); switch( wl ) { case CA_SHUTDOWN : /* sutdown */ sprintf( cmp->rep_buf, "Shutting Down\n" ); rv=1; break; case CA_INCLUDE : /* include */ process_line( parsep, level_incl, program_incl, 0, cmp->rep_buf, "Including", "Unknown", dupep ); break; case CA_EXCLUDE : /* exclude */ process_line( parsep, level_excl, program_excl, 0, cmp->rep_buf, "Excluding", "Unknown", dupep ); break; case CA_CREATE : /* create */ sprintf( cmp->rep_buf, "Creating %s\n", parsep ); parse_watch_line( parsep ); break; case CA_UPDATE : /* create */ sprintf( cmp->rep_buf, "Updating %s\n", parsep ); parse_watch_line( parsep ); break; case CA_FORGET : /* forget */ process_line( parsep, forget_level, forget_program, 0, cmp->rep_buf, "Forgetting", "Unknown", dupep ); break; case CA_KILL : /* kill */ /* Kill the supplied pids ? */ if (try_killing( parsep )) { sprintf( cmp->rep_buf, "Pid entered into kill list\n" ); }else { sprintf( cmp->rep_buf, "Cannot kill pid requested\n" ); } break; case CA_LIST : /* List all details */ break; case CA_TICKLE : /* tickle */ sprintf( cmp->rep_buf, "Oooohhh, that tickles\n" ); break; case CA_WAIT : /* wait for something to complete */ wl = match_a_string( parsep, wait_comms, &parsep ); if (wl) { cmp = add_wait_job( cmp, wl, parsep ); }else { sprintf( cmp->rep_buf, "Unknown wait command - %s\n", parsep ); } break; case CA_DUMP : /* Dump contents to file */ sprintf( cmp->rep_buf, "Dump to file %s - %s\n", parsep, (const char *)(dump_db_to_file( parsep ) ?"Succeeded":"Failed") ); break; case CA_LOAD : /* Load contents from file */ sprintf( cmp->rep_buf, "Load from file %s - %s\n", parsep, (const char *)(read_file_into_db( parsep ) ?"Succeeded":"Failed") ); break; case CA_SLEEP : /* sleep. known as wait for timeout */ cmp = add_wait_job( cmp, WA_TILL, parsep ); break; case CA_SIGNAL : /* signal something */ wl = match_a_string( parsep, signal_comms, &parsep ); if (wl) { sprintf( cmp->rep_buf, "Applying signal %s to %s\n", signal_name(wl), parsep ); signal_things( wl, parsep ); parse_this_line( parsep, level_signal, program_signal, 0 ); }else { sprintf( cmp->rep_buf, "Unknown signal - %s\n", parsep ); } break; case CA_RESTART : /* restart logfiles */ sprintf( cmp->rep_buf, "Restarting output files\n" ); reply_control_msg( cmp ); cmp=NULL; restart_files( ); fprintfile( stderr, "Output files restarted\n" ); break; case CA_RUN : /* sleep. known as wait for timeout */ if ( run_this_batch( parsep )) { sprintf( cmp->rep_buf, "Batch started - %s\n", parsep ); }else { sprintf( cmp->rep_buf, "Unknown batch - %s\n", parsep ); } break; case CA_BOUNCE : /* signal something */ sprintf( cmp->rep_buf, "Bouncing - %s\n", parsep ); parse_this_line( parsep, level_bounce, program_bounce, 0 ); break; case CA_CLEANUP : /* perform shm cleanup */ do_cleanup( comms_hint, 1 ); sprintf( cmp->rep_buf, "Cleanup started\n" ); break; case CA_DEBUG : /* Do something debuggy */ watchdog_debug( parsep, cmp->rep_buf ); break; default: /* you what ?! */ sprintf( cmp->rep_buf, "Unrecognised command\n" ); break; } if (dupep) free( dupep ); if (cmp) { reply_control_msg( cmp ); } } return(rv); } /* -- End of File -- */