#!/usr/bin/perl
#
#   PS-Farm daemon, near copy of golem daemon
#   (simple cron implementation with task dependencies)
#
#   kohts@yandex-team.ru
#
#

BEGIN {
  use Cwd;
  use FindBin;

  # without this chroot abs_path (below)
  # returns empty string if daemon is called
  # from some shell script and current directory
  # is a home directory of a user (permissions?)
  chroot('/');

  $ENV{'MY_BIN'} = "$FindBin::Bin";
  $ENV{'MY_LIB'} = Cwd::abs_path("$ENV{'MY_BIN'}/../lib");
  if (! -d $ENV{'MY_LIB'}) {
    $ENV{'MY_LIB'} = Cwd::abs_path("$ENV{'MY_BIN'}/lib");
  }

  $ENV{'MY_ETC'} = Cwd::abs_path("$ENV{'MY_BIN'}/../etc");
  $ENV{'MY_ROOT'} = Cwd::abs_path("$ENV{'MY_BIN'}/../../../..");

  if (!$ENV{'PS_SNAKED_LIB'}) {
    $ENV{'PS_SNAKED_LIB'} = $ENV{'MY_LIB'};
  }

  my $cfg_next = 0;
  foreach my $arg (@ARGV) {
    if ($arg eq '--cfg') {
      $cfg_next = 1;
      next;
    }
    if ($cfg_next) {
      $ENV{'PS_SNAKED_CFG'} = $arg;
      last;
    }
  }

  unless ($ENV{'PS_SNAKED_CFG'}) {
    $ENV{'PS_SNAKED_CFG'} = $ENV{'MY_ETC'};

    if (! -d $ENV{'PS_SNAKED_CFG'}) {
      $ENV{'PS_SNAKED_CFG'} = ($ENV{'MY_ROOT'} eq "/" ? "" : $ENV{'MY_ROOT'}) .
        "/etc/ps-farm/options/ps-snaked";
    }
    if (! -d $ENV{'PS_SNAKED_CFG'} && -d "/etc/ps-farm/options/ps-snaked") {
      $ENV{'PS_SNAKED_CFG'} = "/etc/ps-farm/options/ps-snaked";
    }
    if (! -d $ENV{'PS_SNAKED_CFG'} && -d "/etc/snaked") {
      $ENV{'PS_SNAKED_CFG'} = "/etc/snaked";
    }
  }

  die "$ENV{'PS_SNAKED_CFG'} is not available. Unable to continue\n\n"
    unless -d "$ENV{'PS_SNAKED_CFG'}";
};

use strict;
use warnings;

use vars qw($VERSION);
$VERSION = '0.02';

use lib "$ENV{'MY_LIB'}";
use lib "$ENV{'PS_SNAKED_LIB'}";
use snaked;

package psProcess;
sub pid { my ($self) = @_; return $self->{'pid'}; }
sub ppid { my ($self) = @_; return $self->{'ppid'}; }
sub pgrp { my ($self) = @_; return $self->{'pgid'}; }
sub pgid { my ($self) = @_; return $self->{'pgid'}; }
sub cmndline { my ($self) = @_; return $self->{'cmd'}; }

package psSnake::Daemon;
use Storable;
use Time::HiRes qw/usleep/;
use Schedule::Cron::Events;
use Time::Local;
use POSIX;
use IO::Handle; # autoflush
use Socket; # socketpair
use Fcntl;

if ($^O ne 'linux') {
  require Proc::ProcessTable;
}

my $version = '($Id: snaked 3935 2010-06-22 19:22:02Z kohts $)';

my $daemon_match_cfg = qr/^([^\s]+perl[^\s]*[\s]+|)[^\s]+(ps-)?snaked.+(daemon|debug).+cfg.+$ENV{'PS_SNAKED_CFG'}/;
my $daemon_match_cfg1 = qr/^([^\s]+perl[^\s]*[\s]+|)[^\s]+(ps-)?snaked.+cfg.+$ENV{'PS_SNAKED_CFG'}.+(daemon|debug)/;
my $daemon_match_nocfg = qr/^([^\s]+perl[^\s]*[\s]+|)[^\s]+(ps-)?snaked.+(daemon|debug)/;
my $watchdog_match = qr/^([^\s]+perl[^\s]*[\s]+|)[^\s]+(ps-)?snaked.+(watchdog).+cfg.+$ENV{'PS_SNAKED_CFG'}/;
my $watchdog_match1 = qr/^([^\s]+perl[^\s]*[\s]+|)[^\s]+(ps-)?snaked.+cfg.+$ENV{'PS_SNAKED_CFG'}.+(watchdog)/;

my $my_path;
my $my_command_line;
my $watchdogs2maintain = 1;

$psSnake::Daemon::runtime = {
  "type" => "master",

  "flags" => {
    "stop" => 0,
    "refresh_configuration" => 0,
    },

  "children" => {},
  "children_cache" => {},

  "usec_2check_watchdog" => 0,
  "usec_2refresh_configuration" => 0,
  
  "start_time" => time(),

  "current_tasks" => {},

  "tasks" => {},
  "task_groups" => {},

  "config" => {},

  };

sub sigTERM_handler {
#  psSnake::do_log("snaked $$ term");
  if ($psSnake::Daemon::runtime->{'type'} eq 'master') {
    $psSnake::Daemon::runtime->{'flags'}->{'stop'} = 1;
  }
  elsif ($psSnake::Daemon::runtime->{'type'} eq 'watchdog') {
    exit;
  }
}
sub sigHUP_handler {
  if ($psSnake::Daemon::runtime->{'type'} eq 'master') {
    $psSnake::Daemon::runtime->{'flags'}->{'refresh_configuration'} = 1;
    $psSnake::Daemon::runtime->{'flags'}->{'refresh_configuration_logged'} = 0;
  }
}
sub sigUSR1_handler {
}
sub sigUSR2_handler {
  if ($psSnake::Daemon::runtime->{'type'} eq 'master') {
    # do not restart if alreadying being stopped
    if (!$psSnake::Daemon::runtime->{'flags'}->{'stop'}) {
      $psSnake::Daemon::runtime->{'flags'}->{'restart'} = 1;
    }
  }
}

$SIG{'TERM'} = \&sigTERM_handler;
$SIG{'HUP'} = \&sigHUP_handler;
$SIG{'USR1'} = \&sigUSR1_handler;
$SIG{'USR2'} = \&sigUSR2_handler;

sub help() {
  print '
    snaked -- cron as it should be
    ' . $version . '

    command-line options:
      
      start-up type:
        --daemon    -- run in background
        --debug     -- run in foreground with debug output

      runtime control:
        --restart   -- schedule restart for currently running daemon
                       (valid only for backgrounded daemon)
        --configure -- schedule reread of configuration
        --status    -- is there daemon running?
        --stop      -- schedule stop for currently running daemon

      configuration:
        --show-jobs   -- show configured daemon jobs
        --version     -- show daemon version

';
  exit 0;
}

sub config_value {
  my ($option_name) = @_;

  my $config = $psSnake::Daemon::runtime->{'config'};
  if ($config->{$option_name}) {
    return $config->{$option_name}->{'value'};
  }
  
  return undef;
}

sub do_err_log {
  my ($msg) = @_;

  my $config = $psSnake::Daemon::runtime->{'config'};

  if (config_value('log_errors')) {
    my $tmp_log = $psSnake::LOG->{'filename'};
    $psSnake::LOG->{'filename'} = config_value('log_errors');
    
    # just in case it fails
    my $res = eval {
      psSnake::do_log($msg);
    };

    $psSnake::LOG->{'filename'} = $tmp_log;
  }
}

sub run_task {
  my ($task_name) = @_;
  
  my $task = $psSnake::Daemon::runtime->{'tasks'}->{$task_name};

  my $old_job_name;
  $old_job_name = $ENV{'JOB_NAME'} if defined($ENV{'JOB_NAME'});
  $ENV{'JOB_NAME'} = $task_name;

  psSnake::debug("running task [$task_name] timeout [$task->{'execution_timeout'}] kill timeout [$task->{'kill_timeout'}]");
  my $o = psSnake::run_forked($task->{'cmd'}, {
    'timeout' => $task->{'execution_timeout'},
    'terminate_on_parent_sudden_death' => 1,
    'terminate_on_signal' => 'TERM',
    'terminate_wait_time' => $task->{'kill_timeout'},
    'clean_up_children' => 1,
    });
  psSnake::debug("finished [$task_name]: " . psSnake::safe_string($o->{'exit_code'}));

  $ENV{'JOB_NAME'} = $old_job_name if $old_job_name;

  if ($o->{'parent_died'}) {
    do_err_log("[$$] my parent died, exiting");
    psSnake::do_log("[$$] my parent died, exiting");
    exit 1;
  }
  elsif ($o->{'err_msg'}) {
    if (! defined($task->{'disable_notifications'})) {
      # save first failure time (this is only valid during
      # child life, parent will set this again in its
      # memory space after child returns $o->{'err_msg'})
      $task->{'runtime'}->{'first_failure_time'} = time()
        unless $task->{'runtime'}->{'first_failure_time'};

      # do not notify more often than once
      # each $task->{'notification_interval'} seconds
      # (notify after each failure if not defined)
      if ($task->{'notification_interval'}) {
        if ($task->{'runtime'}->{'first_failure_time'} + $task->{'notification_interval'} < time()) {
          psSnake::send_mail({
            'to' => ($task->{'admin_email'} ? $task->{'admin_email'} : $psSnake::ADMIN_EMAIL),
            'subject' => $0 . ": $task_name warning",
            'body' => $o->{'err_msg'},
            'no_cc_all' => 1,
            });
          
          # pretend that everything went fine
          # (this will make parent reset first_failure_time)
          return "";
        }
      }
      else {
        psSnake::send_mail({
          'to' => ($task->{'admin_email'} ? $task->{'admin_email'} : $psSnake::ADMIN_EMAIL),
          'subject' => $0 . ": $task_name warning",
          'body' => $o->{'err_msg'},
          'no_cc_all' => 1,
          });
      
        # if we have no notification interval
        # do not mask errors from parent --
        # so it can log them (!)
        #
      }
    }

    # if notification was not sent --
    # let parent know that we had problem
    # and it should set first_failure_time
    # (if not set)
    return $o->{'err_msg'};
  }

  return "";
}

sub add_child {
  my ($type, $id, $opts) = @_;

  $opts = {} unless $opts;

  psSnake::die ("Programmer error: add_child expects at least child type")
    unless $type;

  my $child_socket;
  my $parent_socket;

  socketpair($child_socket, $parent_socket, AF_UNIX, SOCK_STREAM, PF_UNSPEC) ||
    psSnake::die ("socketpair: $!");

  $child_socket->autoflush(1);
  $parent_socket->autoflush(1);

  my $pid;

  if ($pid = fork) {
    # we are a parent
    close $parent_socket;

    my $flags = 0;
    fcntl($child_socket, F_GETFL, $flags) || die "can't fnctl F_GETFL: $!";
    $flags |= O_NONBLOCK;
    fcntl($child_socket, F_SETFL, $flags) || die "can't fnctl F_SETFL: $!";

    my $child = {
      'pid' => $pid,
      'type' => $type,
      'borntime' => time(),
      'killtime' => 0,
      'id' => $id,
      'child_socket' => $child_socket,
      'output' => '',
      };

    $psSnake::Daemon::runtime->{'children'}->{'by_pid'}->{$pid} = $child;
    $psSnake::Daemon::runtime->{'children'}->{'by_type'}->{$type}->{$pid} = $child;

    child_started($type);
  }
  else {
    psSnake::die("cannot fork: $!") unless defined $pid;

    $psSnake::Daemon::runtime->{'type'} = "child";
    close $child_socket;

    my $r = run_task($type);

    while ($r =~ /([^\r\n]+?)([\r\n]|$)/sg) {
      my $s = $1;
      my $e = $2;

      print $parent_socket "$s\n";
    }

    close($parent_socket);
    exit 0;
  }
}

sub get_child_cache {
  my ($name) = @_;

  if (!defined($psSnake::Daemon::runtime->{'children_cache'}->{$name})) {
    $psSnake::Daemon::runtime->{'children_cache'}->{$name} = {
      'laststart' => 0
      };
  }
  return $psSnake::Daemon::runtime->{'children_cache'}->{$name};
}

sub find_child {
  my ($id, $type) = @_;

  foreach my $v (values %{$psSnake::Daemon::runtime->{'children'}->{'by_pid'}}) {
    
    if ($id) {
      # search for child by its id (and type if specified)

      if ($v->{'id'} && $v->{'id'} eq $id &&
        ($type && $v->{'type'} eq $type || !$type)
        ) {

        return $v;
      }
    }
    elsif ($type) {
      # returns first child of specified type
      if ($v->{'type'} eq $type) {
        return $v;
      }
    }
  }
  
  return undef;
}

sub child_started {
  my ($name) = @_;
  $psSnake::Daemon::runtime->{'children_cache'}->{$name}->{'laststart'} = time();
}

sub child_finished {
  my ($name, $output) = @_;
  $psSnake::Daemon::runtime->{'children_cache'}->{$name}->{'lastfinish'} = time();

  if ($output) {
    do_err_log("[$name]: $output");
    psSnake::do_log("[$name]: $output");
  }
}

# reads output from child if any
# (so it can't overflow IPC buffer)
#
sub manage_child {
  my ($pid) = @_;

  my $child = $psSnake::Daemon::runtime->{'children'}->{'by_pid'}->{$pid};

  my $child_socket = $child->{'child_socket'};
  my $child_output = "";
  while (my $l = <$child_socket>) {
    $child_output .= $l;
  }
  
  $child->{'output'} .= $child_output;
}

sub remove_child {
  my ($pid) = @_;

  psSnake::die("Programmer error: remove_child called on child which hasn't finished yet")
    if waitpid($pid,WNOHANG) ne -1;

  my $child = $psSnake::Daemon::runtime->{'children'}->{'by_pid'}->{$pid};
  my $task = $psSnake::Daemon::runtime->{'tasks'}->{$child->{'type'}};

  close($child->{'child_socket'});

  # if child has output -- then it had some situation
  # which requires user invervention; save failure time
  #
  # (unset when child returns nothing --
  # meaning intervention is no longer needed)
  #
  if ($child->{'output'}) {
    $task->{'runtime'}->{'first_failure_time'} = time()
      unless $task->{'runtime'}->{'first_failure_time'};

    psSnake::debug("child output: " . $child->{'output'});

    if (! defined($task->{'disable_notifications'})) {
      # reset failure interval counter,
      # so we do not send notifications
      # more ofthen than notification_interval
      if ($task->{'runtime'}->{'first_failure_time'} + $task->{'notification_interval'} < time()) {
        $task->{'runtime'}->{'first_failure_time'} = time();
      }
    }
  }
  else {
    # delete first error time so next failure time will be saved
    delete($task->{'runtime'}->{'first_failure_time'});
  }

  child_finished($child->{'type'}, $child->{'output'});

  my $child_group = find_group_by_task(undef, $child->{'type'});
  delete $psSnake::Daemon::runtime->{'current_tasks'}->{$child_group}->{$child->{'type'}};

  delete $psSnake::Daemon::runtime->{'children'}->{'by_type'}->{$child->{'type'}}->{$pid};
  delete $psSnake::Daemon::runtime->{'children'}->{'by_pid'}->{$pid};
}

sub have_children {
  my $have_children = 0;
  foreach my $k (keys %{$psSnake::Daemon::runtime->{'children'}->{'by_pid'}}) {
    $have_children = 1;
    last;
  }
  return $have_children;
}

sub for_each_child {
  my ($opts) = @_;

  $opts = {} unless $opts;
  foreach my $k (keys %{$psSnake::Daemon::runtime->{'children'}->{'by_pid'}}) {
    if ($opts->{'stop_now'}) {
#      psSnake::do_log("killing $k");
      kill(15, $k); # TERM (default for run_forked)
    }
  }
}

sub find_group_by_task {
  my ($task_groups, $task_name) = @_;
  $task_groups = $psSnake::Daemon::runtime->{'task_groups'} unless $task_groups;

  foreach my $tg (keys %{$task_groups}) {
    if ($task_groups->{$tg}->{$task_name}) {
      return $tg;
    }
  }

  return undef;
}

# break $psSnake::Daemon::Runtime->{'tasks'}
# (configured in /etc/ps-farm/options/ps-snaked/jobs)
# into task groups by dependency
#
sub prepare_task_groups {
  my $task_groups = $psSnake::Daemon::runtime->{'task_groups'};
  my $tasks = $psSnake::Daemon::runtime->{'tasks'};

  foreach my $task_name (keys %{$tasks}) {
    my $task = $tasks->{$task_name};
    
    my $attach_task_to_group = sub {
      my ($group_name, $task_name) = @_;

      foreach my $tg (keys %{$task_groups}) {
        if ($tg eq $group_name) {
          $task_groups->{$tg}->{$task_name} = 1;
        }
        elsif ($task_groups->{$tg}->{$task_name}) {
          delete ($task_groups->{$tg}->{$task_name});
        }
      }
    };

    # prepare task groups by dependencies

    # attach task to the group where
    # its conflicting tasks are
    #
    if ($task->{'conflicts'}) {
      my $conflicting_groups = {};

      foreach my $c_task (@{$task->{'conflicts'}}) {
        my $conflicting_tg = find_group_by_task($task_groups, $c_task);

        if ($conflicting_tg) {
          $conflicting_groups->{$conflicting_tg} = 1;
        }
      }

      if (scalar(keys %{$conflicting_groups}) > 1) {
        # found conflicting tasks in different task groups,
        # merging those groups into one

        # choose any
        my $dest_group;
        foreach (keys %{$conflicting_groups}) {
          $dest_group = $_;
          last;
        }

        # and attach all tasks from all the conflicting groups
        # to the destination group
        foreach my $tg (keys %{$conflicting_groups}) {
          foreach my $tt (keys %{$task_groups->{$tg}}) {
            $task_groups->{$dest_group}->{$tt} = 1;
          }

          if ($tg && $tg ne $dest_group) {
            delete($task_groups->{$tg});
          }
        }
      }
      elsif (scalar(keys %{$conflicting_groups}) eq 1) {
        # found conflicting tasks in one group,
        # attaching the task to this group

        my $dest_group;
        foreach (keys %{$conflicting_groups}) {
          $dest_group = $_;
          last;
        }

        $attach_task_to_group->($dest_group, $task_name);
      }
      else {
        # conflicting tasks are not in any group,
        # should attach them to the group where
        # this task will be
        
        my $tg = find_group_by_task($task_groups, $task_name);
        
        # current task is not in any group --
        # creating new group with its name
        if (!$tg) {
          $tg = $task_name;
          $task_groups->{$tg} = {};
          
          $attach_task_to_group->($tg, $task_name);
        }

        foreach my $ctg (@{$task->{'conflicts'}}) {
          $attach_task_to_group->($tg, $ctg);
        }
      }
    }

    if (! find_group_by_task($task_groups, $task_name)) {
      $task_groups->{$task_name} = {};
      $attach_task_to_group->($task_name, $task_name);
    }
  }
}

sub refreshOptions {
  my ($dir, $opts) = @_;
  
  $opts = {} unless $opts;

  my $config = $psSnake::Daemon::runtime->{'config'};
  my $tasks = Storable::dclone($psSnake::Daemon::runtime->{'tasks'});
  my $tmp;

  # read daemon options
  my $new_options = {};

  $tmp = psSnake::read_dir($dir, {'output_type' => 'arrayref', 'only-files' => 1});
  foreach my $o (@{$tmp}) {
    next if $o =~ /^\./o;

    my $fileinfo = psSnake::fileinfo_struct({'absolute_name' => $dir . "/" . $o});

    $new_options->{$o} = 1;

    # option was not modified since we've read it
    if ($config->{$o} && $config->{$o}->{'mtime'} eq $fileinfo->{'mtime'}) {
      next;
    }

    my $option_updated = ($config->{$o} ? 1 : 0);

    $config->{$o}->{'mtime'} = $fileinfo->{'mtime'};
    $config->{$o}->{'value'} = psSnake::read_file_option($dir . "/" . $o);

    if ($option_updated) {
      psSnake::do_log("new value for option $o: " . $config->{$o}->{'value'});
    }

    if ($o eq "admin_email") {
      $psSnake::ADMIN_EMAIL = $config->{$o}->{'value'};
    }
  }

  # remove old options
  foreach my $opt_name (keys %{$config}) {
    next if $new_options->{$opt_name};
    
    delete ($config->{$opt_name});
    psSnake::do_log("option $opt_name removed");
  }

  if (!$psSnake::ADMIN_EMAIL) {
    $psSnake::ADMIN_EMAIL = 'root';
  }

  # configure logging (defaults to /tmp/ps-snaked.log, three 10MB files, rotated)
  #
  if ($config->{'log'}) {
    $psSnake::LOG = {
      'filename' => $config->{'log'}->{'value'},
      };
  }
  else {
    $psSnake::LOG = {
      'filename' => ($ENV{'MY_ROOT'} eq "/" ? "" : $ENV{'MY_ROOT'}) . "/tmp/snaked.log",
      };
  }
  
  if ($config->{'log_rotate_size'}) {
    $psSnake::LOG->{'rotate_size'} = $config->{'log_rotate_size'}->{'value'};
  }
  else {
    $psSnake::LOG->{'rotate_size'} = 1024 * 1024 * 10;
  }
  if ($config->{'log_rotate_keep_copies'}) {
    $psSnake::LOG->{'rotate_keep_copies'} = $config->{'log_rotate_keep_copies'}->{'value'};
  }
  else {
    $psSnake::LOG->{'rotate_keep_copies'} = 2;
  }

  # in watchdog mode we don't need
  # to read job definitions
  return if $opts->{'no-jobs'};


  my $defined_jobs = {};

  # read daemon jobs
  $tmp = psSnake::read_dir($dir . "/jobs", {'output_type' => 'arrayref', 'only-directories' => 1});
  foreach my $o (@{$tmp}) {
    next if $o =~ /^\./o;

    $defined_jobs->{$o} = 1;

    my $dirinfo = psSnake::fileinfo_struct({'absolute_name' => $dir . "/jobs/" . $o});

    # job was not modified since we've read it
    if ($tasks->{$o} && $tasks->{$o}->{'mtime'} eq $dirinfo->{'mtime'}) {
      next;
    }

    if ($tasks->{$o}) {
      psSnake::do_log("reread job [$o] from disk");
    }

    $tasks->{$o}->{'mtime'} = $dirinfo->{'mtime'};

    my $joptions = psSnake::read_dir($dir . "/jobs/" . $o, {'output_type' => 'arrayref', 'only-files' => 1});
    foreach my $jo (@{$joptions}) {
      if ($jo eq 'conflicts') {
        $tasks->{$o}->{$jo} = psSnake::read_file_array($dir . "/jobs/" . $o . "/" . $jo);
      }
      elsif ($jo eq 'cmd') {
        $tasks->{$o}->{$jo} = $dir . "/jobs/" . $o . "/" . $jo;
      }
      else {
        $tasks->{$o}->{$jo} = psSnake::read_file_option($dir . "/jobs/" . $o . "/" . $jo);
      }
    }
    if (defined($tasks->{$o}->{'disabled'}) && !$opts->{'keep_disabled'}) {
      delete($tasks->{$o});
    }
  }
  
  # cleanup removed jobs, validate tasks
  TASKS: foreach my $task_name (keys %{$tasks}) {
    if (!$defined_jobs->{$task_name}) {
      psSnake::do_log("job [$task_name] removed");
      delete($tasks->{$task_name});
      next TASKS;
    }

    my $task = $tasks->{$task_name};

    if (!defined($task->{'execution_timeout'}) || !int($task->{'execution_timeout'})) {
      $task->{'execution_timeout'} = 0;
    }
    if (!defined($task->{'kill_timeout'}) ||
      !int($task->{'kill_timeout'}) && $task->{'kill_timeout'} ne 0) {
      $task->{'kill_timeout'} = 60;
    }

    foreach my $mp ("cmd") {
      if (!$task->{$mp}) {
        psSnake::do_log("skipping job [$task_name]: mandatory parameter [$mp] not specified");
        delete($tasks->{$task_name});
        next TASKS;
      }
    }
    if (! -x $task->{'cmd'}) {
      psSnake::do_log("skipping job [$task_name]: [$task->{'cmd'}] is not executable");
      delete($tasks->{$task_name});
      next TASKS;
    }

    if ((!$task->{'execution_interval'} && !$task->{'execution_schedule'}) ||
      ($task->{'execution_interval'} && $task->{'execution_schedule'})) {
      
      psSnake::do_log("skipping job [$task_name]: one and only one of (execution_interval, execution_schedule) must be defined");
      delete($tasks->{$task_name});
      next TASKS;
    }

    if ($task->{'execution_schedule'}) {
      my $cron;
      eval {
        $cron = new Schedule::Cron::Events($task->{'execution_schedule'}, Seconds => time());
      };

      if (!$cron) {
        my $msg = $@;
        # leave only first line
        $msg =~ s/[\r\n].+$//sgo;
        # remove filename in which the error was raised
        $msg =~ s/at\ \/.+$//sgo;
        $msg = ": $msg" if $msg;

        psSnake::do_log("skipping job [$task_name]: invalid execution_schedule $msg");
        delete($tasks->{$task_name});
        next TASKS;
      }
      $task->{'cron'} = $cron;
      $task->{'next_run'} = timelocal($task->{'cron'}->nextEvent);
    }

    foreach my $dp ("execution_interval", "execution_timeout", "notification_interval", "start_random_sleep") {
      if ($task->{$dp} && !psSnake::is_digital($task->{$dp})) {
        psSnake::do_log("skipping job [$task_name]: [$dp] must be numeric");
        delete($tasks->{$task_name});
        next TASKS;
      }
    }

    if ($task->{'conflicts'} && ref($task->{'conflicts'}) ne 'ARRAY') {
      psSnake::do_log("skipping job [$task_name]: [conflicts] must be an array reference");
      delete($tasks->{$task_name});
      next TASKS;
    }
    if ($task->{'conflicts'}) {
      foreach my $c_task (@{$task->{'conflicts'}}) {
        if ($c_task eq $task_name) {
          psSnake::do_log("skipping job [$task_name]: task conflicts with itself.");
          delete($tasks->{$task_name});
          next TASKS;
        }
      }
    }
  }

  # apply new tasks, recalculate task groups
  $psSnake::Daemon::runtime->{'tasks'} = $tasks;
  $psSnake::Daemon::runtime->{'task_groups'} = {};
  prepare_task_groups();
}

sub code_may_fail {
  my ($code, $opts) = @_;

  die("Need coderef (something to execute)")
    unless $code && ref($code) eq 'CODE';
  
  $opts = {} unless $opts;
  $opts->{'tries'} = 1 unless $opts->{'tries'};
  $opts->{'sleep_between_tries'} = 1 unless $opts->{'sleep_between_tries'};

  my $i = 0;
  my $lastwarn = "";
  my $code_result;

  while ($i < $opts->{'tries'}) {
    $i = $i + 1;

    $SIG{'__WARN__'} = sub { $lastwarn = join("\n", @_); };
    eval { $code_result = $code->(); };
    delete($SIG{'__WARN__'});

    if ($lastwarn && $i < $opts->{'tries'}) {
      $lastwarn = "";
      sleep $opts->{'sleep_between_tries'};
    }
  }

  return {
    'result' => $code_result,
    'warn' => $lastwarn,
    'try' => $i,
    };
}

sub get_process_table {
  my $ptable;

  # Proc::ProcessTable has some leaks on linux
  # which leads to process dying
  if ($^O eq 'linux') {
    $ptable = [];
    
    my $i = 0;

    my $dummy;
    my $open_res;
    while (!($open_res = opendir($dummy, "/proc")) && $i < 3) {
      sleep 1;
      $i++;
    }
    if (!$open_res) {
      psSnake::die("unable to read /proc");
    }

    my @all_entries;
    $i = 0;
    while (scalar(@all_entries) < 3 && $i < 3) {
      @all_entries = readdir($dummy);
      sleep 1 if $i > 0;
      $i++;
    }
    close($dummy);

    # . + .. eq 2
    if (scalar(@all_entries) < 3) {
      psSnake::die("/proc is not mounted");
    }

    my $read_may_fail = sub {
      my ($filename) = @_;
      my $filecontent;
      if (open F, $filename) {
        { local $/ = undef; $filecontent = <F>; }
        close F;
      }
      return $filecontent;
    };

    foreach my $e (sort @all_entries) {
      my $pid_dir = "/proc/$e";

      next if $e eq '.' || $e eq '..';
      next if $e !~ /^\d+$/o;
      next if ! -d $pid_dir;
      
      my $cmd = $read_may_fail->("$pid_dir/cmdline");
      $cmd =~ s/\0/ /goi if $cmd;

      my $stat = $read_may_fail->("$pid_dir/stat");
      next unless $stat;

      my @stat_arr = split(" ", $stat);
      next if ! scalar(@stat_arr) > 5;

      if (!$cmd) {
        $cmd = $stat_arr[1];
        
        if ($cmd) {
          $cmd =~ s/[\(\)]//goi;
          $cmd = "[" . $cmd . "]";
        }
      }

      my $ppid = $stat_arr[3];
      my $pgid = $stat_arr[4];

      next if ! $cmd;
      next if ! $ppid =~ /^\d+$/o;
      next if ! $pgid =~ /^\d+$/o;

      my $p = {
        'pid' => $e,
        'ppid' => $ppid,
        'pgid' => $pgid,
        'cmd' => $cmd,
        };

      bless ($p, 'psProcess');
      push (@{$ptable}, $p);
    }

    return $ptable;
  }
  else {
    my $r = code_may_fail(sub {return Proc::ProcessTable->new()->table}, {'tries' => 3});

    if (!$r->{'result'}) {
      psSnake::die("unable to get process table: " . $r->{'warn'});
    }
    
    $ptable = $r->{'result'};
  }

  my $i = 0;
  while (scalar(@{$ptable}) < 2 && $i < 3) {
    $i++;
    sleep 1;
    $ptable = get_process_table();
  }

  if (scalar(@{$ptable}) < 2) {
    psSnake::die("unable to read process table");
  }

  return $ptable;
}

sub get_process_by_id {
  my ($pid, $opts) = @_;

  $opts = {} unless $opts;

  my $processes;
  if ($opts->{'processes'}) {
    $processes = $opts->{'processes'};
  }
  else {
    if (!$psSnake::Daemon::runtime->{'startup_processes'}) {
      $psSnake::Daemon::runtime->{'startup_processes'} = get_process_table();
    }
    $processes = $psSnake::Daemon::runtime->{'startup_processes'};
  }

  foreach my $p (@$processes) {
    my $r = code_may_fail(sub {return $p->pid});

    if (!$r->{'result'}) {
#      print STDERR
#        "empty pid: " . $p->cmndline . "; " .
#        ((-f $p->cmndline) ? "file exists" : "file does not exist") .
#        "; my pid [" . $$ . "]" .
#        "\n";
#
# dvina: empty pid: /proc/23263/cmdline; file does not exist; my pid [23348]
# dunai: empty pid: /proc/31978/cmdline; file does not exist; my pid [32072]

      next;
    }
    
    return $p if $r->{'result'} eq $pid;
  }

  return undef;
}

# get the pid of my parent process (by command line)
# 
sub get_my_process {
  my ($pid) = @_;

  my $orig_pid = $pid;

  # trying to find daemon with the same --cfg option
  #
  while ($pid ne 1) {
    my $pid_p = get_process_by_id($pid);

    if (!$pid_p) {
      psSnake::die("unable to find [$pid] in process list");
    }

    if ($pid_p->cmndline &&
      ($pid_p->cmndline =~ /$daemon_match_cfg/ || $pid_p->cmndline =~ /$daemon_match_cfg1/)
      ) {
      return $pid_p;
    }

    $pid = $pid_p->ppid;
  }

  # backward compatibility: trying to find
  # any daemon without --cfg option
  $pid = $orig_pid;
  while ($pid ne 1) {
    my $pid_p = get_process_by_id($pid);
    if (!$pid_p) {
      psSnake::die("unable to find [$pid] in process list");
    }

    if ($pid_p->cmndline && $pid_p->cmndline =~ /$daemon_match_nocfg/ && $pid_p->cmndline !~ /--cfg/) {
      return $pid_p;
    }

    $pid = $pid_p->ppid;
  }

  return undef;
}

# get pid of other daemon started with the same --cfg option
#
sub get_other_daemon_process {
  my ($opts) = @_;
  $opts = {} unless $opts;

  my $processes;
  if (!$psSnake::Daemon::runtime->{'startup_processes'} || $opts->{'refresh_startup_processes'}) {
    $psSnake::Daemon::runtime->{'startup_processes'} = get_process_table();
  }
  $processes = $psSnake::Daemon::runtime->{'startup_processes'};

  # this doesn't mean "always find my process",
  # name of the sub is not consistent!!!
  #
  # it usually returns undef (during --stop for example)
  #
  my $my_process = get_my_process($$);

  my $r;

  # trying to find other daemon with the same --cfg option
  #
  foreach my $p (@$processes) {
    my $p_pid;
    my $p_cmndline;
    my $p_pgrp;
    $r = code_may_fail(sub {$p_pid = $p->pid});
    $r = code_may_fail(sub {$p_cmndline = $p->cmndline});
    $r = code_may_fail(sub {$p_pgrp = $p->pgrp});

    next unless $p_cmndline;
    next if $p_cmndline !~ /$daemon_match_cfg/ && $p_cmndline !~ /$daemon_match_cfg1/;
    
    # find process with given command line
    # from other process group
    if ($my_process) {
      next if $p->pgrp eq $my_process->pgrp;
    }

    # if we are looking for daemon then its parent should be init
    if ($p_cmndline !~ /--debug/o && $p->ppid ne 1) {
      next;
    }

    my $real_daemon = get_process_by_id($p_pgrp);
    
    # found a process for which group leader doesn't exist
    # (shouldn't happen but just in case of some error)
    # 
    #
    # real world situation:
    #
    # pechora:~# ps -eo pid,ppid,pgrp,cmd | grep snak | grep -v grep
    #  5674     1  5674 /usr/bin/perl /usr/local/ps-snake/bin/snaked --watchdog --cfg /etc/ps-farm/options/ps-snaked
    # 26550     1 25742 /usr/bin/perl /usr/local/ps-snake/bin/snaked --daemon --cfg /etc/ps-farm/options/ps-snaked
    # 29634     1 25742 /usr/bin/perl /usr/local/ps-snake/bin/snaked --daemon --cfg /etc/ps-farm/options/ps-snaked
    #
    # corresponding log message about parent pid:
    # Sat Apr 10 16:37:52 2010 [/usr/local/ps-snake/bin/snaked] [25742] started
    # 
    # both 26550 and 29634 were not snaked daemons
    # but were forks doing some work (actually locked
    # during log operation or something) but
    # watchdog doesn't detect difference between
    # snaked daemon and its children forks
    # (it should and will do it one day probably)
    # 
    # and manual `snaked --daemon` also didn't detect them,
    # now it kills them (their process group)
    # before spawning new daemon
    # 
    if (!$real_daemon) {
      print STDERR "cleaning up stuck process group [$p_pgrp]\n";
      kill(-9, $p_pgrp);
    }
    
    return $real_daemon;
  }

  # backward compatibility: trying to find
  # any other daemon without --cfg option
  #
  foreach my $p (@$processes) {
    my $p_pid;
    my $p_cmndline;
    my $p_pgrp;
    $r = code_may_fail(sub {$p_pid = $p->pid});
    $r = code_may_fail(sub {$p_cmndline = $p->cmndline});
    $r = code_may_fail(sub {$p_pgrp = $p->pgrp});

    next unless $p_cmndline;
    next unless $p_cmndline =~ /$daemon_match_nocfg/;
    next if $p_cmndline =~ /--cfg/;

    if ($my_process) {
      next if $p->pgrp eq $my_process->pgrp;
    }

    my $real_daemon = get_process_by_id($p_pgrp);
    return $real_daemon;
  }

  return undef;
}

sub canonical_command_line {
  my ($cmdline, $path) = @_;

  return "" unless $cmdline && $path;

  # suppress space in the end of command on freebsd
  $cmdline =~ s/\ +$//go;

  # replace path to the executable with full path
  #
  # notes:
  #   - regexp is not global so it replaces only 1st occurrence
  #
  #   - .+? is not greedy so it will find the 1st occurrence of
  #   "ps-snaked" string which should be the name of executable
  # 
  $cmdline =~ s/.+?(ps-)?snaked(\s+|$)/${path}\/snaked /;
  $cmdline =~ s/\s+$//goi;

  return $cmdline;
}

sub get_my_path_commandline {
  my ($opts) = @_;
  my $my_path;
  my $my_command_line;

  my $me = get_process_by_id($$, $opts);
  psSnake::die("[$$]: unable to find myself in process list") unless $me;
  $my_path = $FindBin::Bin;
  psSnake::die("[$$]: unable to find my path") unless $my_path;
  $my_command_line = $me->cmndline;
  psSnake::die("[$$] unable to determine my command line") unless $my_command_line;

  return ($my_path, $my_command_line);
}

sub exec_ps_snaked {
  my ($my_command_line, $my_path) = @_;

  # on ws1-569 in snaked.log got:
  #
  # Mon Oct 19 17:59:17 2009 [/place/home/monitor/ps-snake/usr/local/ps-snake/bin/snaked] unable to exec  --cfg /place/home/monitor/ps-snake/etc/ps-farm/options/ps-snaked
  #
  # which effectively means that $my_command_line was empty
  # after calling canonical_command_line() below
  # (" --cfg ..." was appended to it in the next step)
  #
  # so trying to determine my command line if it's empty
  # (also added check on startup that we've got it)
  #

  # as a workaround for empty command line or path (why?)
  # trying to determine them during exit process
  if (!$my_command_line || !$my_path) {
    ($my_path, $my_command_line) = get_my_path_commandline({'processes' => get_process_table()});
  }

  if (my $pid = fork) {
    # parent
    exit 0;
  }
  else {
    POSIX::setsid() || psSnake::die("Error running setsid: " . $!);

    # let parent exit and clean up from /proc (or whatever)
    sleep 1;

    POSIX::setsid() || psSnake::do_log("Error running setsid: " . $!, {'stderr' => 1}) && die();
    
    $my_command_line = canonical_command_line($my_command_line, $my_path);

    # append --cfg parameter if it's not specified
    # (codepath is used only during first run
    # when path to configuration was specified
    # by environment variable)
    if ($my_command_line !~ /--cfg $ENV{'PS_SNAKED_CFG'}/) {
      $my_command_line .= " --cfg $ENV{'PS_SNAKED_CFG'}";
    }

    
    # set environment variable to specify that we want to cleanup
    # already running snaked processes (this might be workaround
    # for some FreeBSD or Proc::ProcessTable on FreeBSD bug,
    # which caused the following:
    #
    # Thu Jun 24 10:29:31 2010 [/opt/home/monitor/ps-snake/usr/local/ps-snake/bin/snaked] clock moved back from Thu Jun 24 10:29:25 2010 to Thu Jun 24 10:29:24 2010, restarting
    # Thu Jun 24 10:29:38 2010 [/opt/home/monitor/ps-snake/usr/local/ps-snake/bin/snaked] [24836] requested to restart
    # Thu Jun 24 10:29:38 2010 [/opt/home/monitor/ps-snake/usr/local/ps-snake/bin/snaked] [24836] stopped
    # Thu Jun 24 10:29:54 2010 [/opt/home/monitor/ps-snake/usr/local/ps-snake/bin/snaked] [WARN] [29246] snaked is already running: /usr/bin/perl /opt/home/monitor/ps-snake/usr/local/ps-snake/bin/snaked --daemon --cfg /opt/home/monitor/ps-snake/etc/ps-farm/options/ps-snaked  [24836]
    #
    # [monitor@orange64 ~]$ uname -a
    # FreeBSD orange64.yandex.ru 7.2-STABLE FreeBSD 7.2-STABLE #0 r199991M: Mon Feb  8 12:50:25 MSK 2010     root@distillatory.yandex.ru:/place/tmp/mk_pkg.wG1LSf1f/obj/place/GIT-repos/FreeBSD-7-r199991/sys/PRODUCTION  amd64
    #
    # Proc::ProcessTable 0.54
    #
    $ENV{'snaked_cleanup_already_running'} = 1;

    # next life
    exec($my_command_line) || psSnake::do_log("[$$] unable to exec $my_command_line", {'stderr' => 1}) && die();
  }
  exit(255);
}

# spawn additional watchdogs slowly
sub manage_watchdogs {
  my $ptable = get_process_table();

  my $number_of_watchdogs = 0;
  # get the ps-snaked daemon process for which the watchdog is running
  my $my_process = undef;
  foreach my $p (@$ptable) {
    next unless $p->cmndline;
    next if $p->cmndline !~ /$watchdog_match/ && $p->cmndline !~ /$watchdog_match1/;
    $number_of_watchdogs = $number_of_watchdogs + 1;
  }

  if ($number_of_watchdogs < $watchdogs2maintain) {
    my $t_cmdline = $my_command_line;
    $t_cmdline = canonical_command_line($t_cmdline, $my_path);
    $t_cmdline =~ s/\-\-daemon/\-\-watchdog/;
    psSnake::run_forked($t_cmdline);
  }
}

sub stop_watchdogs {
  my $ptable = get_process_table();

  # get the ps-snaked daemon process for which the watchdog is running
  my $my_process = undef;
  foreach my $p (@$ptable) {
    next unless $p->cmndline;
    next if $p->cmndline !~ /$watchdog_match/ && $p->cmndline !~ /$watchdog_match1/;
    
    kill (15, $p->pid);
  }
}

# watchdog mode, starts ps-snaked daemon
# if finds that it's not running
sub run_watchdog {

  # set daemon type to change signal handling slightly
  $psSnake::Daemon::runtime->{'type'} = 'watchdog';

  my $unsuccessful_tries = 0;
  my $life_time = 3600 * (rand($watchdogs2maintain) + 1);

  while(1) {
    # stop watchdogs from time to time to toss
    # their pid numbers (which might affect oom killers),
    # but not in case they detect that main process
    # is not running (and waiting a bit to start it)
    # 
    # watchdogs are restarted by main daemon.
    # 
    if ((time() - $psSnake::Daemon::runtime->{'start_time'}) > $life_time && !$unsuccessful_tries) {
      exit(0);
    }

    if ($psSnake::Daemon::runtime->{'usec_2check_watchdog'} < 1) {
      my $ptable = get_process_table();

      my $currently_running_watchdogs = 0;

      # get the ps-snaked daemon process for which the watchdog is running
      my $my_process = undef;

      foreach my $p (@$ptable) {
        my $p_cmndline;
        my $r = code_may_fail(sub {$p_cmndline = $p->cmndline});

        next unless $p_cmndline;

        if ($p_cmndline =~ /$watchdog_match/ || $p_cmndline =~ /$watchdog_match1/) {
          $currently_running_watchdogs = $currently_running_watchdogs + 1;
        }
        elsif ($p_cmndline =~ /$daemon_match_cfg/ || $p_cmndline =~ /$daemon_match_cfg1/) {
          # at this point any snaked is selected
          # (even that which is starting
          # or running external command)

          my $p_pid;
          my $p_ppid;
          my $p_pgrp;
          $r = code_may_fail(sub {$p_pid = $p->pid});
          $r = code_may_fail(sub {$p_ppid = $p->ppid});
          $r = code_may_fail(sub {$p_pgrp = $p->pgrp});

          next unless $p_pid && $p_ppid && $p_pgrp;

          # real daemon is parented by init and is the process group leader,
          # if its not found -- start it, and it will clean up any
          # stuck child from previous daemon (shouldn't happen because
          # children are strongly attached to the main daemon
          # with use of terminate_on_sudden_parent_death flag of run_forked)
          if ($p_ppid eq 1 && $p_pid eq $p_pgrp) {
            $my_process = $p;
          }
        }
      }

      if ($my_process) {
        $unsuccessful_tries = 0;
      }
      else {
        $unsuccessful_tries = $unsuccessful_tries + 1;
      }

      if ($unsuccessful_tries > 0) {
        if ($unsuccessful_tries < 2) {
          # 4 seconds should be enough to start daemon
          # (if it's not found and began to start -- is restarting),
          # randomize each watchdog so they do not try to start
          # all at the same time
          # 
          sleep(4 + 4 * int(rand($currently_running_watchdogs)));
        }
        else {
          psSnake::do_log("watchdog [$$]: snaked not found (killed?), respawning");
          # replace --watchdog with --daemon
          my $t_cmdline = $my_command_line;
          $t_cmdline =~ s/\-\-watchdog/\-\-daemon/;

          # try to execute daemon instead of watchdog
          # if fork fails (wouldn't succeed probably,
          # but could we try at least?)
          #
          if (defined(my $pid = fork)) {
            if ($pid) {
              my $waitpid;
              
              # exec_ps_snaked forks before actually execing snaked
              # and parent exits immediately (which makes it
              # totally detached from watchdog)
              #
              while ($waitpid ne -1) {
                $waitpid = waitpid($pid, WNOHANG);
                sleep 1;
              }

              # watchdog to continue
              $unsuccessful_tries = 0;
            }
            else {
              # watchdog to become snaked
              # (detached from parent totally)
              exec_ps_snaked($t_cmdline, $my_path);
            }
          }
          else {
            exec_ps_snaked($t_cmdline, $my_path);
          }
        }
      }

      $psSnake::Daemon::runtime->{'usec_2check_watchdog'} = ($watchdogs2maintain + 1) * 2 * 2000000;
    }

    usleep(50000);
    $psSnake::Daemon::runtime->{'usec_2check_watchdog'} = $psSnake::Daemon::runtime->{'usec_2check_watchdog'} - 50000;
  }
  exit (255);
}

psSnake::read_cmdline();

if (defined($psSnake::cmd_opts->{'stop'})) {
  my $d = get_other_daemon_process();
  if ($d) {
    print "requesting " . $d->pid() . " [" . $d->cmndline . "] to stop\n";
    kill (15, $d->pid);
  }
  else {
    print "no snaked daemon found for $ENV{'PS_SNAKED_CFG'}\n";
  }
  exit 0;
}
elsif (defined($psSnake::cmd_opts->{'configure'})) {
  my $d = get_other_daemon_process();
  if ($d) {
    print "requesting " . $d->pid() . " [" . $d->cmndline . "] to refresh configuration\n";
    kill ("HUP", $d->pid)
  }
  else {
    print "no snaked daemon found for $ENV{'PS_SNAKED_CFG'}\n";
  }
  exit 0;
}
elsif (defined($psSnake::cmd_opts->{'restart'})) {
  my $d = get_other_daemon_process();
  if ($d) {
    if (!defined($psSnake::cmd_opts->{'only-errors'})) {
      print "requesting " . $d->pid() . " [" . $d->cmndline . "] to restart\n";
    }
    kill ("USR2", $d->pid)
  }
  else {
    print "no snaked daemon found for $ENV{'PS_SNAKED_CFG'}\n";
  }
  exit 0;
}
elsif (defined($psSnake::cmd_opts->{'status'})) {
  my $d = get_other_daemon_process();
  if ($d) {
    print "snaked is running as pid " . $d->pid . ". command line [" . $d->cmndline . "]\n";
  }
  else {
    print "no daemon running\n";
  }
  exit 0;
}
elsif (defined($psSnake::cmd_opts->{'show-jobs'})) {
  refreshOptions($ENV{'PS_SNAKED_CFG'}, {'keep_disabled' => 1});
  print "    global options\n";
  foreach my $k (sort keys %{$psSnake::Daemon::runtime->{'config'}}) {
    print "      $k: " . $psSnake::Daemon::runtime->{'config'}->{$k}->{'value'} . "\n";
  }

  print "    configured jobs:\n";
  foreach my $job_name (sort keys %{$psSnake::Daemon::runtime->{'tasks'}}) {
    print "      " . $job_name . "\n";
    my $job = $psSnake::Daemon::runtime->{'tasks'}->{$job_name};
    foreach my $o (sort keys %{$job}) {
      print "        $o: ";
      if (ref($job->{$o}) eq 'ARRAY') {
        print join(",", @{$job->{$o}});
      }
      else {
        print $job->{$o};
      }
      print "\n";
    }
  }
  exit 0;
}
elsif (defined($psSnake::cmd_opts->{'version'})) {
  print "$version\n";
  exit 0;
}

my $i_am_watchdog = defined($psSnake::cmd_opts->{'watchdog'});

if (!defined($psSnake::cmd_opts->{'daemon'}) && !$psSnake::debug && !$i_am_watchdog) {
  help();
  exit 0;
}

refreshOptions($ENV{'PS_SNAKED_CFG'}, {'no-jobs' => $i_am_watchdog});
my $t = $psSnake::LOG->{'filename'};
if (!psSnake::can_log()) {
  psSnake::warn("Can not write to log file [$t], check permissions; logging to STDERR");
}
if (config_value('log_errors')) {
  if (!psSnake::can_write(config_value('log_errors'))) {
    psSnake::warn("Can not write to log_errors file [" . config_value('log_errors') .
      "], check permissions.");
    delete($psSnake::Daemon::runtime->{'config'}->{'log_errors'});
  }
}

if (!$i_am_watchdog) {
  my $d = get_other_daemon_process();
  if ($d) {
    if (!$ENV{'snaked_cleanup_already_running'}) {
      psSnake::warn("[$$] snaked is already running: " . $d->cmndline . " ["  . $d->pid . "]");
      exit 1;
    }
    else {
      my $previous_snaked = $d;
      $ENV{'snaked_cleanup_already_running'} = undef;
      kill(-9, $d->pid);
      sleep 3;
      $d = get_other_daemon_process({'refresh_startup_processes' => 1});
      if ($d) {
        psSnake::warn("[$$] snaked is already running: " . $d->cmndline . " ["  . $d->pid . "] and doesn't stop on KILL signal");
        exit 1;
      }
      else {
        psSnake::warn("[$$] killed previously running snaked: " . $previous_snaked->cmndline . " ["  . $previous_snaked->pid . "], continuing to start");
      }
    }
  }
}

($my_path, $my_command_line) = get_my_path_commandline();

psSnake::debug("my_path: $my_path");
psSnake::debug("my_command_line: $my_command_line");

print "starting snaked daemon for $ENV{'PS_SNAKED_CFG'}\n"
  unless $i_am_watchdog;

if (defined($psSnake::cmd_opts->{'daemon'}) || $i_am_watchdog) {
  # restart daemon using its full pathname and config path
  # if it was not started like this (so we could distinguish
  # between daemons by their locations)
  if ($my_command_line !~ /$my_path/ || $my_command_line !~ /--cfg $ENV{'PS_SNAKED_CFG'}/) {
    sigUSR2_handler();
  }

  # daemonize
  chdir ('/') || die "unable chdir to /: $!";
  open(STDIN, "/dev/null") || die("unable to read from /dev/null: $!");
  open(STDOUT, "dev/null") || die("unable to write to /dev/null: $!");
  defined(my $pid = fork) || die "Can't fork: $!";
  exit if $pid;
  POSIX::setsid() || die("Error running setsid: " . $!);
  open(STDERR, "/dev/null") || die("unable to write to /dev/null: $!");

  # run watchdog (except for when snaked
  # would be restarted right after start)
  if ($i_am_watchdog && !$psSnake::Daemon::runtime->{'flags'}->{'restart'}) {
    run_watchdog();
    exit;
  }
}
elsif ($psSnake::debug) {
  # stay in foreground
}

psSnake::do_log("[$$] started");

if ($psSnake::Daemon::runtime->{'config'}->{'pidfile'} &&
  !$psSnake::Daemon::runtime->{'flags'}->{'restart'} &&
  !$i_am_watchdog) {

  if (psSnake::can_write($psSnake::Daemon::runtime->{'config'}->{'pidfile'}->{'value'})) {
    psSnake::write_file_option($psSnake::Daemon::runtime->{'config'}->{'pidfile'}->{'value'}, $$);
  }
}

my $previous_now;
my $current_now;

my $max_job_time = int(config_value('max_job_time'));
$max_job_time = 3600 * 2 unless $max_job_time;

while (1) {
  $previous_now = $current_now;
  $current_now = time();

  # clock moved back -- restarting
  if ($previous_now && $current_now && ($previous_now > $current_now)) {
    psSnake::do_log("clock moved back from " . localtime($previous_now) . " to " . localtime($current_now) . ", restarting");
    sigUSR2_handler();
  }

  if (!$psSnake::Daemon::runtime->{'flags'}->{'restart'}) {
    if ($psSnake::Daemon::runtime->{'usec_2check_watchdog'} < 1) {
      manage_watchdogs() if $watchdogs2maintain;
      $psSnake::Daemon::runtime->{'usec_2check_watchdog'} = ($watchdogs2maintain + 1) * 2 * 2000000;
    }
  }

  my $have_active_children = values %{$psSnake::Daemon::runtime->{'children'}->{'by_pid'}};
  psSnake::debug("active children:") if $have_active_children;

  # check status of all children removing those which finished
  foreach my $v (values %{$psSnake::Daemon::runtime->{'children'}->{'by_pid'}}) {
    
    # minimize time() call a bit
    my $now = time();

    # check for really long running processes
    # and kill them brutally (not very fast
    # if killing doesn't work; blocking io?)
    #
    if (($now - $v->{'borntime'}) > $max_job_time && ($now - $v->{'killtime'}) > 5) {
      # kill first then log, because logging might fail
      # which leads to "die"

      # killing exactly child pid, which is only a "manager"
      # for the task; open3_run which is executed inside the child
      # checks whether manager is alive and terminates if not,
      # so killing manager notifies child that it should stop.
      kill(9, $v->{'pid'});
      $v->{'killtime'} = time();

      do_err_log("killed long running (". ($now - $v->{'borntime'}) .
        " seconds) process [$v->{'pid'}] [$v->{'type'}]", {"stderr" => 1});
      psSnake::do_log("killed long running (". ($now - $v->{'borntime'}) .
        " seconds) process [$v->{'pid'}] [$v->{'type'}]", {"stderr" => 1});
    }

    my $waitpid = waitpid($v->{'pid'}, WNOHANG);
    
    psSnake::debug("\tchild [$v->{'pid'}] [$v->{'type'}] [" . ($v->{'id'} ? $v->{'id'} : "") . "]: $waitpid;".
      " running " . (time() - $v->{'borntime'}) . " seconds");

    manage_child($v->{'pid'});

    if ($waitpid eq -1) {
      remove_child($v->{'pid'});
    }
  }

  if ($psSnake::Daemon::runtime->{'flags'}->{'refresh_configuration'} ||
    $psSnake::Daemon::runtime->{'usec_2refresh_configuration'} < 1) {
    if (!have_children()) {
      if ($psSnake::Daemon::runtime->{'flags'}->{'refresh_configuration'}) {
        psSnake::do_log("requested to reread configuration, rereading");
      }
      refreshOptions($ENV{'PS_SNAKED_CFG'});
      $psSnake::Daemon::runtime->{'flags'}->{'refresh_configuration'} = 0;
      $psSnake::Daemon::runtime->{'usec_2refresh_configuration'} = 1000000 * 60;
    }
    else {
      if ($psSnake::Daemon::runtime->{'flags'}->{'refresh_configuration'} &&
        !$psSnake::Daemon::runtime->{'flags'}->{'refresh_configuration_logged'}) {
        psSnake::do_log("requested to reread configuration, waiting for children to stop");
        $psSnake::Daemon::runtime->{'flags'}->{'refresh_configuration_logged'} = 1;
      }
    }
  }
  if ($psSnake::Daemon::runtime->{'flags'}->{'restart'}) {
    if ($psSnake::debug) {
      psSnake::warn("unable to restart attached daemon");
      $psSnake::Daemon::runtime->{'flags'}->{'restart'} = 0;
    }
    else {
      if (!$psSnake::Daemon::runtime->{'flags'}->{'stop'}) {
        psSnake::do_log("[$$] requested to restart");
        $psSnake::Daemon::runtime->{'flags'}->{'stop'} = 1;
      }
    }
  }

  # do processing if we were not requested to stop
  unless ($psSnake::Daemon::runtime->{'flags'}->{'stop'}) {

    my $task_groups = $psSnake::Daemon::runtime->{'task_groups'};

    # run all the tasks one by one (task groups are concurrent)
    foreach my $tg (keys %{$task_groups}) {
      $psSnake::Daemon::runtime->{'current_tasks'}->{$tg} = {}
        unless $psSnake::Daemon::runtime->{'current_tasks'}->{$tg};

      my $current_tasks = $psSnake::Daemon::runtime->{'current_tasks'}->{$tg};
      my $configured_tasks = $psSnake::Daemon::runtime->{'tasks'};

      if (scalar(keys %{$current_tasks})) {
        # process tasks in this task group
        foreach my $task_name (keys %{$current_tasks}) {
          my $task = $configured_tasks->{$task_name};
          
          # do not start task if it's already running
          next if find_child(undef, $task_name);

          # check if there're tasks runing
          # which block this task
          my $do_not_run = 0;
          foreach my $ctask (keys %{$task_groups->{$tg}}) {
            next if $ctask eq $task_name;

            if (find_child(undef, $ctask)) {
              $do_not_run = 1;
              last;
            }
          }

          next if $do_not_run;

          psSnake::debug("starting [$task_name]");
          add_child($task_name);
        }
      }
      else {
        # we've completed all tasks, reschedule tasks which need to be executed
        foreach my $task_name (keys %{$task_groups->{$tg}}) {
          my $task = $configured_tasks->{$task_name};
          my $child_e = get_child_cache($task_name);
          my $now = time();

          if ($task->{'start_random_sleep'}) {
            if (!$child_e->{'startup_sleep'}) {
              $child_e->{'startup_sleep'} = int(rand($task->{'start_random_sleep'}));
              $child_e->{'startup_sleep_started'} = $now;
              psSnake::debug("task [$task_name] random sleep [$child_e->{'startup_sleep'}]");
            }

            if ($now - $child_e->{'startup_sleep_started'} > $child_e->{'startup_sleep'}) {
              $child_e->{'startup_sleep_finished'} = $now;
              psSnake::debug("task [$task_name] random sleep finished");
            }
          }
          else {
            # random startup sleep not configured for the task
            $child_e->{'startup_sleep_finished'} = $now;
          }

          # schedule only those tasks which:
          #   - finished random startup sleep time (if configured) AND
          #     - were not run for $task->{'execution_interval'} time or
          #     - have there next_run time passed
          #
          if ($child_e->{'startup_sleep_finished'} && (
                $task->{'next_run'} && $task->{'next_run'} <= $now
                  ||
                $task->{'execution_interval'} &&
                  $now - $child_e->{'laststart'} > $task->{'execution_interval'}
              )
            ) {

            if ($task->{'next_run'}) {
              $task->{'next_run'} = timelocal($task->{'cron'}->nextEvent);
            }

            $current_tasks->{$task_name} = $task;
          }
        }
      }
    }
  }
  else {
    # wait for children to exit and exit then
    if (have_children()) {
      for_each_child ({'stop_now' => 1});
      psSnake::debug("waiting for children to exit");
      sleep 1;
    }
    else {
      unlink($psSnake::Daemon::runtime->{'config'}->{'pidfile'}->{'value'})
        if $psSnake::Daemon::runtime->{'config'}->{'pidfile'};
      
      psSnake::do_log("[$$] stopped");

      # do not restart watchdogs on restart as they will try
      # to start snaked if restart fails (which should not happen
      # but happens in 0,02-0,03 % of cases)
      #
      # we may want to send some signal to watchdogs here
      # to notify them about restart so they could extend
      # their waiting cycle a bit
      #
      if ($psSnake::Daemon::runtime->{'flags'}->{'restart'}) {
        exec_ps_snaked($my_command_line, $my_path);
      }
      else {
        stop_watchdogs() if !$i_am_watchdog;
      }
      
      exit 0;
    }
  }

  if ($psSnake::debug) {
    psSnake::debug("-");
    sleep (1);
  }
  else {
    my $usec_to_sleep;
    if ($have_active_children) {
      $usec_to_sleep = 50000;
    }
    else {
      $usec_to_sleep = 500000;
    }

   usleep($usec_to_sleep);
   $psSnake::Daemon::runtime->{'usec_2check_watchdog'} = $psSnake::Daemon::runtime->{'usec_2check_watchdog'} - $usec_to_sleep;
   $psSnake::Daemon::runtime->{'usec_2refresh_configuration'} = $psSnake::Daemon::runtime->{'usec_2refresh_configuration'} - $usec_to_sleep;
  }
}

# yes i know this is the way
# to the world of endless may
exit(255);

__END__

=head1 NAME

snaked - cron as it should be.

=head1 SYNOPSIS

  # import old cron jobs (TO BE IMPLEMENTED)
  snaked --import-crontabs

  # check which jobs are configured
  snaked --show-jobs

  # rock with snake
  snaked --daemon

  # (and do not forget to stop old cron
  # so your jobs are not run twice)

=head1 DESCRIPTION

B<snaked> is a job scheduler, just like cron,
which has several unique features making it
much more flexible and useful than any other
cron implementation.

It is heavily tested on Linux and FreeBSD
but might (and hopefully with your help will)
be run on any Perl + POSIX compliant system.

=head2 limit job execution time

You can choose to configure the maximum limit of time
for each job to finish. If job doesn't finish in time
it is killed. The limit is independently configurable
for each job. Forget about lockf, ps -ef | grep -v grep
and cron jobs being run twice and more times concurrently.

You can also configure the upper limit of execution time
of any job of the given snaked instance. This global limit
is checked independently of the individual
job execution time limits.

=head2 unique job id and job dependencies

Each snaked job has its unique job identifier
which is used to configure job dependencies:
for any job you can specify other jobs
(addressed by their identifiers) which
shouldn't be run with this job concurrently.

So if job A is being executed and time comes
to start job B which is configured as conflicting
with job A, then the start of job B is postponed
until job A is finished.

=head2 more often than once a minute

snaked allows jobs to be run more often than once a minute.

Actually snaked supports two execution schedule formats:
old cron format with not less than a minute time resolution
and snaked job schedule format which specifies how often
the jobs is run in seconds, making it possible to run job
even once a second!

=head1 CONFIGURATION EXAMPLE

snaked configuration is a directory which contains
global instance options (each option in separate file)
and associated job definitions where job definition
is also a directory with each job option
stored in a separate file:

  .
  |-- admin_email
  |-- jobs
  |   |-- every_hour
  |   |   |-- cmd
  |   |   `-- execution_schedule
  |   |-- every_ten_seconds
  |   |   |-- cmd
  |   |   `-- execution_interval
  |   `-- fast_job
  |       |-- cmd
  |       |-- conflicts
  |       `-- execution_interval
  `-- log

Above shown configuration defines admin_email
for the snaked instance (optional, defaults to root)
and log file path (optional, defaults to /tmp/snaked.log):

  testing18:/etc/snaked# cat admin_email
  root
  testing18:/etc/snaked# cat log
  /var/log/snaked/snaked.log

There are three jobs named every_hour, every_ten_seconds
and fast_job. All of them contain cmd file -- an executable
which is run by snaked (this can be any executable
allowed by underlying operating system):

  testing18:/etc/snaked/jobs/every_hour# ls -l cmd
  -rwxr-xr-x 1 root root 0 2010-07-07 00:24 cmd
  testing18:/etc/snaked/jobs/every_hour# file cmd
  cmd: POSIX shell script text executable

First job, every_hour, has a parameter execution_schedule
which is an old cron schedule example (parsed by L<Schedule::Cron::Events>):

  testing18:/etc/snaked/jobs/every_hour# cat execution_schedule
  0 * * * *

Two other jobs use snaked execution_interval schedule,
specifying that every_ten_seconds job should be run
once in ten seconds, and fast_job should be run
once in every second.

  testing18:/etc/snaked/jobs/every_ten_seconds# cat execution_interval
  10
  testing18:/etc/snaked/jobs/fast_job# cat execution_interval
  1

To make it a bit more explanatory we've defined conflicts option
for fast_job which specifies that fast_job should not be run
if every_ten_seconds is running:

  testing18:/etc/snaked/jobs/fast_job# cat conflicts
  every_ten_seconds

Which translates to "try to run 'fast_job' as often as once a second,
but wait if 'every_ten_seconds' job is running".

=head1 DAEMON OPTIONS

=over 4

=item admin_email

Optional. Where to send emails about failing jobs. Defaults to root.

=item log

Optional. Name of the log filename which holds all the log messages
including informational and error messages. Defaults to /tmp/snaked.log.

=item log_errors

Optional. Name of the log filename used only for error messages.
Defaults to nothing, turning off separate error logging.

=item log_rotate_size

Optional. Size of the log file after which it is rotated.
Defaults to 10 MB.

=item log_rotate_keep_copies

Optional. Number of rotated log files to preserve.
Defaults to 2.

=item max_job_time

Optional. Specifies maximum exeuction time limit for all the jobs.
Defaults to 2 hours.

=item pidfile

Optional. Filename of the pidfile where snaked stores
the pid of its main process. Defaults to nothing,
which does not generate any pidfile.

=back

=head1 JOB OPTIONS

=over 4

=item admin_email

Optional. Where toe send emails about failures of this job.
Defaults to global admin_email option (and overrides it). 

=item cmd

Mandatory. Executable with correct file permissions (executable bit on)
which is allowed by underlying operating system. Can be shell script or binary.

=item execution_interval, execution_schedule

Only one parameter, execution_interval or execution_schedule, is allowed
and is mandatory for one job. execution_interval specifies number of seconds
(positive integer) between invocations of cmd. execution_schedule specifies
standard cron format schedule (first five fields) for the job.

=item execution_timeout

Optional. Specifies time limit for the job, in seconds.
Defaults to nothing, turning time limit off.

=item kill_timeout

Optional. Specifies time in seconds between TERM and KILL signals
sent to the job when snaked needs to stop the job (when snaked
stop or restarts, when job runs too long). Defaults to 60 seconds.

=item notification_interval

Optional. Time period in seconds. Job failure emails are not sent
more often than this time period. First email is sent after
first time period of constant failures. This option is used
to suppress emails about accidental job failures. Defaults to 0,
which turns the feature off (delivers email on every job failure).

=item start_random_sleep

Optional. Time period in seconds which specifoes random
first run shift in time for the job. Defaults to 0,
which turns the feature off.

=item conflicts

Optional. Space/line separated list of job identifiers
which should not be run while this job is run. If any
job from this list is currently being executed
then the job owning the option will not be executed.
Defaults to nothing, allowing the job to be run
independently of the status of any other job.

snaked organizes conflicting jobs into job groups,
running every job from the job group one by one
(if the time for the job has come). This guarantees
that every conflicting job is run from time to time
though its start time might be shifted because of
waiting for the conflicting jobs.

=back

=head1 CREDITS

Thank you to Yandex team (in alphabetic order):

  Denis Barov
  Maxim Dementyev
  Eugene Fedotov
  Andrey Grunau
  Andrey Ignatov
  Dmitry Parfenov
  Alexey Simakov
  Julie S Ukhlicheva
  Anton Ustyugov
  Andrey Zonov

for their bug reports, suggestions and contributions.

=head1 AUTHORS

Petya Kohts E<lt>petya@kohts.ruE<gt>

=head1 COPYRIGHT

Copyright 2009 - 2010 Petya Kohts.

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=cut
