run.pl 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. #!/usr/bin/env perl
  2. use warnings; #sed replacement for -w perl parameter
  3. # In general, doing
  4. # run.pl some.log a b c is like running the command a b c in
  5. # the bash shell, and putting the standard error and output into some.log.
  6. # To run parallel jobs (backgrounded on the host machine), you can do (e.g.)
  7. # run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB
  8. # and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier].
  9. # If any of the jobs fails, this script will fail.
  10. # A typical example is:
  11. # run.pl some.log my-prog "--opt=foo bar" foo \| other-prog baz
  12. # and run.pl will run something like:
  13. # ( my-prog '--opt=foo bar' foo | other-prog baz ) >& some.log
  14. #
  15. # Basically it takes the command-line arguments, quotes them
  16. # as necessary to preserve spaces, and evaluates them with bash.
  17. # In addition it puts the command line at the top of the log, and
  18. # the start and end times of the command at the beginning and end.
  19. # The reason why this is useful is so that we can create a different
  20. # version of this program that uses a queueing system instead.
  21. #use Data::Dumper;
  22. @ARGV < 2 && die "usage: run.pl log-file command-line arguments...";
  23. #print STDERR "COMMAND-LINE: " . Dumper(\@ARGV) . "\n";
  24. $job_pick = 'all';
  25. $max_jobs_run = -1;
  26. $jobstart = 1;
  27. $jobend = 1;
  28. $ignored_opts = ""; # These will be ignored.
  29. # First parse an option like JOB=1:4, and any
  30. # options that would normally be given to
  31. # queue.pl, which we will just discard.
  32. for (my $x = 1; $x <= 2; $x++) { # This for-loop is to
  33. # allow the JOB=1:n option to be interleaved with the
  34. # options to qsub.
  35. while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
  36. # parse any options that would normally go to qsub, but which will be ignored here.
  37. my $switch = shift @ARGV;
  38. if ($switch eq "-V") {
  39. $ignored_opts .= "-V ";
  40. } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") {
  41. # we do support the option --max-jobs-run n, and its GridEngine form -tc n.
  42. # if the command appears multiple times uses the smallest option.
  43. if ( $max_jobs_run <= 0 ) {
  44. $max_jobs_run = shift @ARGV;
  45. } else {
  46. my $new_constraint = shift @ARGV;
  47. if ( ($new_constraint < $max_jobs_run) ) {
  48. $max_jobs_run = $new_constraint;
  49. }
  50. }
  51. if (! ($max_jobs_run > 0)) {
  52. die "run.pl: invalid option --max-jobs-run $max_jobs_run";
  53. }
  54. } else {
  55. my $argument = shift @ARGV;
  56. if ($argument =~ m/^--/) {
  57. print STDERR "run.pl: WARNING: suspicious argument '$argument' to $switch; starts with '-'\n";
  58. }
  59. if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
  60. $ignored_opts .= "-sync "; # Note: in the
  61. # corresponding code in queue.pl it says instead, just "$sync = 1;".
  62. } elsif ($switch eq "-pe") { # e.g. -pe smp 5
  63. my $argument2 = shift @ARGV;
  64. $ignored_opts .= "$switch $argument $argument2 ";
  65. } elsif ($switch eq "--gpu") {
  66. $using_gpu = $argument;
  67. } elsif ($switch eq "--pick") {
  68. if($argument =~ m/^(all|failed|incomplete)$/) {
  69. $job_pick = $argument;
  70. } else {
  71. print STDERR "run.pl: ERROR: --pick argument must be one of 'all', 'failed' or 'incomplete'"
  72. }
  73. } else {
  74. # Ignore option.
  75. $ignored_opts .= "$switch $argument ";
  76. }
  77. }
  78. }
  79. if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20
  80. $jobname = $1;
  81. $jobstart = $2;
  82. $jobend = $3;
  83. if ($jobstart > $jobend) {
  84. die "run.pl: invalid job range $ARGV[0]";
  85. }
  86. if ($jobstart <= 0) {
  87. die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility).";
  88. }
  89. shift;
  90. } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
  91. $jobname = $1;
  92. $jobstart = $2;
  93. $jobend = $2;
  94. shift;
  95. } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
  96. print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]\n";
  97. }
  98. }
  99. # Users found this message confusing so we are removing it.
  100. # if ($ignored_opts ne "") {
  101. # print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n";
  102. # }
  103. if ($max_jobs_run == -1) { # If --max-jobs-run option not set,
  104. # then work out the number of processors if possible,
  105. # and set it based on that.
  106. $max_jobs_run = 0;
  107. if ($using_gpu) {
  108. if (open(P, "nvidia-smi -L |")) {
  109. $max_jobs_run++ while (<P>);
  110. close(P);
  111. }
  112. if ($max_jobs_run == 0) {
  113. $max_jobs_run = 1;
  114. print STDERR "run.pl: Warning: failed to detect number of GPUs from nvidia-smi, using ${max_jobs_run}\n";
  115. }
  116. } elsif (open(P, "</proc/cpuinfo")) { # Linux
  117. while (<P>) { if (m/^processor/) { $max_jobs_run++; } }
  118. if ($max_jobs_run == 0) {
  119. print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo\n";
  120. $max_jobs_run = 10; # reasonable default.
  121. }
  122. close(P);
  123. } elsif (open(P, "sysctl -a |")) { # BSD/Darwin
  124. while (<P>) {
  125. if (m/hw\.ncpu\s*[:=]\s*(\d+)/) { # hw.ncpu = 4, or hw.ncpu: 4
  126. $max_jobs_run = $1;
  127. last;
  128. }
  129. }
  130. close(P);
  131. if ($max_jobs_run == 0) {
  132. print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a\n";
  133. $max_jobs_run = 10; # reasonable default.
  134. }
  135. } else {
  136. # allow at most 32 jobs at once, on non-UNIX systems; change this code
  137. # if you need to change this default.
  138. $max_jobs_run = 32;
  139. }
  140. # The just-computed value of $max_jobs_run is just the number of processors
  141. # (or our best guess); and if it happens that the number of jobs we need to
  142. # run is just slightly above $max_jobs_run, it will make sense to increase
  143. # $max_jobs_run to equal the number of jobs, so we don't have a small number
  144. # of leftover jobs.
  145. $num_jobs = $jobend - $jobstart + 1;
  146. if (!$using_gpu &&
  147. $num_jobs > $max_jobs_run && $num_jobs < 1.4 * $max_jobs_run) {
  148. $max_jobs_run = $num_jobs;
  149. }
  150. }
  151. sub pick_or_exit {
  152. # pick_or_exit ( $logfile )
  153. # Invoked before each job is started helps to run jobs selectively.
  154. #
  155. # Given the name of the output logfile decides whether the job must be
  156. # executed (by returning from the subroutine) or not (by terminating the
  157. # process calling exit)
  158. #
  159. # PRE: $job_pick is a global variable set by command line switch --pick
  160. # and indicates which class of jobs must be executed.
  161. #
  162. # 1) If a failed job is not executed the process exit code will indicate
  163. # failure, just as if the task was just executed and failed.
  164. #
  165. # 2) If a task is incomplete it will be executed. Incomplete may be either
  166. # a job whose log file does not contain the accounting notes in the end,
  167. # or a job whose log file does not exist.
  168. #
  169. # 3) If the $job_pick is set to 'all' (default behavior) a task will be
  170. # executed regardless of the result of previous attempts.
  171. #
  172. # This logic could have been implemented in the main execution loop
  173. # but a subroutine to preserve the current level of readability of
  174. # that part of the code.
  175. #
  176. # Alexandre Felipe, (o.alexandre.felipe@gmail.com) 14th of August of 2020
  177. #
  178. if($job_pick eq 'all'){
  179. return; # no need to bother with the previous log
  180. }
  181. open my $fh, "<", $_[0] or return; # job not executed yet
  182. my $log_line;
  183. my $cur_line;
  184. while ($cur_line = <$fh>) {
  185. if( $cur_line =~ m/# Ended \(code .*/ ) {
  186. $log_line = $cur_line;
  187. }
  188. }
  189. close $fh;
  190. if (! defined($log_line)){
  191. return; # incomplete
  192. }
  193. if ( $log_line =~ m/# Ended \(code 0\).*/ ) {
  194. exit(0); # complete
  195. } elsif ( $log_line =~ m/# Ended \(code \d+(; signal \d+)?\).*/ ){
  196. if ($job_pick !~ m/^(failed|all)$/) {
  197. exit(1); # failed but not going to run
  198. } else {
  199. return; # failed
  200. }
  201. } elsif ( $log_line =~ m/.*\S.*/ ) {
  202. return; # incomplete jobs are always run
  203. }
  204. }
  205. $logfile = shift @ARGV;
  206. if (defined $jobname && $logfile !~ m/$jobname/ &&
  207. $jobend > $jobstart) {
  208. print STDERR "run.pl: you are trying to run a parallel job but "
  209. . "you are putting the output into just one log file ($logfile)\n";
  210. exit(1);
  211. }
  212. $cmd = "";
  213. foreach $x (@ARGV) {
  214. if ($x =~ m/^\S+$/) { $cmd .= $x . " "; }
  215. elsif ($x =~ m:\":) { $cmd .= "'$x' "; }
  216. else { $cmd .= "\"$x\" "; }
  217. }
  218. #$Data::Dumper::Indent=0;
  219. $ret = 0;
  220. $numfail = 0;
  221. %active_pids=();
  222. use POSIX ":sys_wait_h";
  223. for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  224. if (scalar(keys %active_pids) >= $max_jobs_run) {
  225. # Lets wait for a change in any child's status
  226. # Then we have to work out which child finished
  227. $r = waitpid(-1, 0);
  228. $code = $?;
  229. if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen.
  230. if ( defined $active_pids{$r} ) {
  231. $jid=$active_pids{$r};
  232. $fail[$jid]=$code;
  233. if ($code !=0) { $numfail++;}
  234. delete $active_pids{$r};
  235. # print STDERR "Finished: $r/$jid " . Dumper(\%active_pids) . "\n";
  236. } else {
  237. die "run.pl: Cannot find the PID of the child process that just finished.";
  238. }
  239. # In theory we could do a non-blocking waitpid over all jobs running just
  240. # to find out if only one or more jobs finished during the previous waitpid()
  241. # However, we just omit this and will reap the next one in the next pass
  242. # through the for(;;) cycle
  243. }
  244. $childpid = fork();
  245. if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; }
  246. if ($childpid == 0) { # We're in the child... this branch
  247. # executes the job and returns (possibly with an error status).
  248. if (defined $jobname) {
  249. $cmd =~ s/$jobname/$jobid/g;
  250. $logfile =~ s/$jobname/$jobid/g;
  251. }
  252. # exit if the job does not need to be executed
  253. pick_or_exit( $logfile );
  254. system("mkdir -p `dirname $logfile` 2>/dev/null");
  255. open(F, ">$logfile") || die "run.pl: Error opening log file $logfile";
  256. print F "# " . $cmd . "\n";
  257. print F "# Started at " . `date`;
  258. $starttime = `date +'%s'`;
  259. print F "#\n";
  260. close(F);
  261. # Pipe into bash.. make sure we're not using any other shell.
  262. open(B, "|bash") || die "run.pl: Error opening shell command";
  263. print B "( " . $cmd . ") 2>>$logfile >> $logfile";
  264. close(B); # If there was an error, exit status is in $?
  265. $ret = $?;
  266. $lowbits = $ret & 127;
  267. $highbits = $ret >> 8;
  268. if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" }
  269. else { $return_str = "code $highbits"; }
  270. $endtime = `date +'%s'`;
  271. open(F, ">>$logfile") || die "run.pl: Error opening log file $logfile (again)";
  272. $enddate = `date`;
  273. chop $enddate;
  274. print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n";
  275. print F "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n";
  276. close(F);
  277. exit($ret == 0 ? 0 : 1);
  278. } else {
  279. $pid[$jobid] = $childpid;
  280. $active_pids{$childpid} = $jobid;
  281. # print STDERR "Queued: " . Dumper(\%active_pids) . "\n";
  282. }
  283. }
  284. # Now we have submitted all the jobs, lets wait until all the jobs finish
  285. foreach $child (keys %active_pids) {
  286. $jobid=$active_pids{$child};
  287. $r = waitpid($pid[$jobid], 0);
  288. $code = $?;
  289. if ($r == -1) { die "run.pl: Error waiting for child process"; } # should never happen.
  290. if ($r != 0) { $fail[$jobid]=$code; $numfail++ if $code!=0; } # Completed successfully
  291. }
  292. # Some sanity checks:
  293. # The $fail array should not contain undefined codes
  294. # The number of non-zeros in that array should be equal to $numfail
  295. # We cannot do foreach() here, as the JOB ids do not start at zero
  296. $failed_jids=0;
  297. for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  298. $job_return = $fail[$jobid];
  299. if (not defined $job_return ) {
  300. # print Dumper(\@fail);
  301. die "run.pl: Sanity check failed: we have indication that some jobs are running " .
  302. "even after we waited for all jobs to finish" ;
  303. }
  304. if ($job_return != 0 ){ $failed_jids++;}
  305. }
  306. if ($failed_jids != $numfail) {
  307. die "run.pl: Sanity check failed: cannot find out how many jobs failed ($failed_jids x $numfail)."
  308. }
  309. if ($numfail > 0) { $ret = 1; }
  310. if ($ret != 0) {
  311. $njobs = $jobend - $jobstart + 1;
  312. if ($njobs == 1) {
  313. if (defined $jobname) {
  314. $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with
  315. # that job.
  316. }
  317. print STDERR "run.pl: job failed, log is in $logfile\n";
  318. if ($logfile =~ m/JOB/) {
  319. print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script.";
  320. }
  321. }
  322. else {
  323. $logfile =~ s/$jobname/*/g;
  324. print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n";
  325. }
  326. }
  327. exit ($ret);