// file: $isip/class/mmedia/Splitter/splt_06.cc // version: $Id: splt_06.cc 10675 2009-07-21 01:13:30Z tm334 $ // // isip include files // #include "Splitter.h" // method: splitJobList // // arguments: // Filename& input_file: (input) input file list // Filename& cpus: (input) machine list name or number // String& command_to_execute: (input) Any command that has to run on N files // String& walltime: (input) The time required to run a sigle job on 1 cpu // String& batch_command: (input) The command used to submit to the cluster // Sdb& sdb_list: (input) This list of all the files // String& cluster_name: (input) This specifies the name of the cluster // // return: logical error status // // this method splits a list of jobs and submits it to the cluster // // read the cpus list if the input is an sof file // bool8 Splitter::splitJobList(Filename& input_file_a, Filename& cpus_a, String& command_to_execute_a, String& walltime_a, String& batch_command_a, Sdb& sdb_list_a, String& cluster_name_a) { // declare local variables // String output; Long cpu_number; String command; String scripts_filename; String out_scripts; Vector log_files; String begin_time; String end_time; Long start_time; Long finish_time; String proc_id_str; char* exec_command_cluster; // check for the number of cpus required, it can be either from a list // file(sof) or just a number // if (!cpus_a.eq(NULL)) { if (File::exists(cpus_a)) { Sdb cpus_db; Sof cpus_sof; if (cpus_sof.open(cpus_a)) { if (cpus_db.read(cpus_sof, (Long)0)) { cpu_number = cpus_db.length(); output.concat(L"The no. of cpus required is "); output.concat(cpu_number); output.concat(L"\n"); } } cpus_sof.close(); } // if the input is not a file then take the commandline input as the // number of cpus // else { output.concat(L"The no. of cpus required is "); String temp1(cpus_a); cpu_number.assign(temp1); output.concat(cpus_a); output.concat(L"\n"); } } else { Error::handle(name(), L"cpus - unable to get number of cpus required, check if the number of processors required is mentioned", Error::ARG, __FILE__, __LINE__); return false; } // read the various commandline options // if (!command_to_execute_a.eq(NULL)) { output.concat(L"\n The command is "); output.concat(command_to_execute_a); } else { Error::handle(name(), L"Command - unable to get the -command_to_execute argument, check if you have mentioned the command", Error::ARG, __FILE__, __LINE__); return false; } if (!walltime_a.eq(NULL)) { output.concat(L"\n The estimated walltime is "); output.concat(walltime_a); } else { walltime_a.assign(L"01:00:00"); } if (!batch_command_a.eq(NULL)) { batch_command_a.toLower(); output.concat(L"\n The batch command is "); output.concat(batch_command_a); } else { batch_command_a.assign(L"qsub"); } // loop over all files and count the number of files to be processed // int32 num_files = 0; sdb_list_a.gotoFirst(); do { // fetch the next file // sdb_list_a.getName(input_file_a); num_files++; } while (sdb_list_a.gotoNext()); // if the number of cpus is greater than the number of id's or files, then // assign the number of id's or files as the cpu_number // if (cpu_number > num_files) { cpu_number.assign(num_files); } // calculate the number of jobs per list // int list_length = 0; list_length = num_files / cpu_number; int reminder_list = 0; reminder_list = num_files % cpu_number; //create intermediate scripts for job submission to the cluster // sdb_list_a.gotoFirst(); // set the vector length for collecting the logfile names, which will be // used during polling and accumulation // log_files.setLength(cpu_number); // get the start time before the jobs have been submitted // start_time.assign(Integral::time()); // get the formated date String for the job start // Integral::cTime(begin_time); // loop through the files for each cpu and submit batch jobs to the cluster // Console::put(L"\nSubmitting jobs:\n"); for (int32 i = 0; i < cpu_number; i++) { // open the scripts file // File scripts_file; // define a persistent scratch space // String tmp; String number; tmp.concat(L"job_isip_run"); tmp.concat(L"_"); String base_name(tmp); // check if the filename already exists // bool8 flag_exists = false; int32 index = 0; while (!flag_exists) { // build a unique string // number.assign(index); // build a new filename // tmp.assign(base_name); tmp.concat(number); tmp.concat(Integral::getPid()); scripts_filename.assign(tmp); // check if all the scripts have been created // if (!File::exists(scripts_filename)) { break; } else { index++; } } // generate script file // Splitter generate_script; generate_script.generateScript(scripts_file,scripts_filename,out_scripts,cluster_name_a,list_length,reminder_list,input_file_a,sdb_list_a,i,command_to_execute_a,walltime_a); // construch the unique log filenames // String tmp2; String log_filename; tmp2.concat(L".job_isip_run"); tmp2.concat(L"_"); String base_name2(tmp2); // build a new filename // tmp2.assign(base_name2); tmp2.concat(i); tmp2.concat(L"_"); tmp2.concat(Integral::getPid()); tmp2.concat(L".log"); log_filename.assign(tmp2); // remove any file that exists with the same name as the log files that // will be generated // if (File::exists(log_filename)) { File::remove(log_filename); } // generate command for job submission // Splitter generate_command; generate_command.generateCommand(log_filename,scripts_filename,command,cluster_name_a); // add the logfilename in a vector inorder to use it for checking for file // creation // log_files(i).assign(log_filename); // convert the wide character string to a character byte // exec_command_cluster = (char*)(byte8*)command; // execute the command // if (system(exec_command_cluster) < 0) { // print debugging information // output.assign(L"error executing command: "); output.concat(command); Console::put(output); return Integral::exit(); } output.assign(L"Job#"); output.concat(i); output.concat(L" submitted to "); output.concat(cluster_name_a); Console::increaseIndention(); Console::put(output); Console::decreaseIndention(); // sleep for some time and then proceed // Integral::sleep(1); // delete the temporary file // File::remove(scripts_filename); } // poll for the log files created by PBS // Splitter poll_log_files; Console::put(L"\nWaiting for the jobs to finish..."); poll_log_files.pollLogFile(log_files); // sleep for some time before opening the log files and accumulating them // Integral::sleep(3); Splitter accumulate_log_files; accumulate_log_files.waitAndAccumulate(log_files, proc_id_str); // get the finish time for the job // finish_time.assign(Integral::time()); // get the time when all jobs have been completed // Integral::cTime(end_time); // send email to user notifying the job completion // Splitter send_email; send_email.sendEmail(start_time,finish_time,begin_time,end_time,proc_id_str); // exit gracefully // return true; } // method: pollLogFile // // arguments: // Vector& log_files: (input) log files list // // return: logical error status // // this method polls for the log files created by PBS // bool8 Splitter::pollLogFile(Vector& log_files_a) { Vector temp_list(log_files_a); Long log_files_length(temp_list.length()); String log_files_name; int32 log_count = 0; // poll every 2 seconds to check for the presence of the logfiles // do { Integral::sleep(2); for (int32 i = 0; i < temp_list.length(); i++) { if (File::exists(temp_list(i))) { temp_list.deleteRange(i, 1); log_count++; if (log_count == log_files_length) { Console::put(L"The submitted jobs have been completed"); break; } } } } while (log_count != log_files_length); // exit gracefully // return true; } // method: waitAndAccumulate // // arguments: // Vector& log_files: (input) log files list // String& proc_id: (input) process id // // return: logical error status // // this method accumulates all the the log files into a single file and removes // all the log files created by PBS. // bool8 Splitter::waitAndAccumulate(Vector& log_files_a, String& proc_id_str_a) { // declare local variables // File accumulated_log_file; String accumulated_log_filename; Long proc_id(Integral::getPid()); String output; // construct the accumulated log filename // proc_id_str_a.assign(proc_id); accumulated_log_filename.assign(L"log_"); accumulated_log_filename.concat(proc_id_str_a); accumulated_log_filename.concat(L".log"); // Open the accumulated log file // if (!accumulated_log_file.open(accumulated_log_filename, File::APPEND_PLUS, File::TEXT)) { output.assign(L"\ncannot open file: "); output.concat(accumulated_log_filename); output.concat(L", bailing out..."); Console::put(output); Integral::exit(); } // print the name of the accumulated log file // output.assign(L"Accumulating the log files and writing the details into ./"); output.concat(accumulated_log_filename); Console::put(output); // loop through all the PBS log files // Long log_files_length(log_files_a.length()); for(int32 i = 0; i < log_files_length; i++) { File tmp; // open the PBS log file // tmp.open(log_files_a(i), File::READ_ONLY, File::TEXT); SysString str; String contents; while(!tmp.eof()) { tmp.get(str); contents.concat(str); contents.concat(L"\n"); } // write the contents into the accumulated log file // accumulated_log_file.put(contents); // close the PBS log file // tmp.close(); } // remove all the PBS logfiles // for(int32 i = 0; i < log_files_length; i++) { // delete the PBS log file // File::remove(log_files_a(i)); } // close the accumulated log file // accumulated_log_file.close(); // exit gracefully // return true; } // method: sendEmail // // arguments: // Long& start_time: (input) start time // Long& finish_time: (input) finish time // String& begin_time: (input) start time in a specific format // String& end_time: (input) end time in a specific format // String& proc_id_str: (input) process id // // return: logical error status // // this method sends an email after all the jobs have finished running on the // cluster. // bool8 Splitter::sendEmail(Long& start_time_a, Long& finish_time_a, String& begin_time_a, String& end_time_a, String& proc_id_str_a) { // declar local variables // String email_content; String sendmail; String output; String date; char* exec_command_sendmail; // build the body of the email // email_content.assign(L"Your job(PID - "); email_content.concat(proc_id_str_a); email_content.concat(L") on the cluster has finished.\n"); email_content.concat(L"The job was started on : "); email_content.concat(begin_time_a); email_content.concat(L"The job ended on : "); email_content.concat(end_time_a); email_content.concat(L"The total time taken for the job is: "); email_content.concat(getDiffTime(start_time_a, finish_time_a)); // print the information n the standard output // Console::put(email_content); // assign the sendmail command to the string // sendmail.assign(L"echo "); sendmail.concat(L"\""); sendmail.concat(email_content); sendmail.concat(L"\" | "); sendmail.concat(L"mail -s \"Your job is finished\" "); SysString var(L"USER"); SysString val; Integral::getEnv(val, var); sendmail.concat(val); // convert the wide character string to a character string // exec_command_sendmail = (char*)(byte8*)sendmail; // execute the command // if (system(exec_command_sendmail) < 0) { // print debugging information // output.assign(L"error executing command: "); output.concat(sendmail); Console::put(output); return Integral::exit(); } // print this message // Console::put(L"An email has been sent to you with the above details!"); //exit gracefully // return true; } // method: getDiffTime // // arguments: // Long& start_time: (input) start time // Long& finish_time: (input) finish_time // // return: time String (difference between two values) // //returns the difference date and time as a string in mm/dd/yyyy format // String Splitter::getDiffTime(Long& start_time_a, Long& finish_time_a) { Long diff_time; diff_time = finish_time_a - start_time_a; Long seconds = (int)diff_time % 60; Long min_all = (int)diff_time / 60; Long hour = (int)min_all / 60; Long min = (int)min_all % 60; // construct the string that has to be returned // String time; time.assign(hour); time.concat(L" hours, "); time.concat(min); time.concat(L" minutes, "); time.concat(seconds); time.concat(L" seconds"); // exit gracefully // return time; } // method: generateScript // // arguments: // File& scripts_file : (input) script file // String& scripts_filename : (input) script filename // String& out_scripts : (input) out script // String& cluster_name : (input) cluster name // int list_length : (input) list length // int reminder_list : (input) reminder list // Filename& input_file : (input) input file name // Sdb& sdb_list : (input) sdb list // int32 job_index : (input)job index ID // String& command_to_execute : (input) command to execute // String& wall_time : (input) wall time // // return: logical error status // // this method pass the script file according to the cluster location. // bool8 Splitter::generateScript(File& scripts_file_a, String& scripts_filename_a, String& out_scripts_a, String& clustername_a, int list_length_a, int reminder_list_a, Filename& inputfile_a, Sdb& sdblist_a, int32 job_index_a, String& command_execute_a, String& wall_time_a) { // declar local variables // String output; // write the temporary script files for submitting to the cluster // if (!scripts_file_a.open(scripts_filename_a, File::WRITE_ONLY, File::TEXT)) { output.assign(L"\ncannot open file: "); output.concat(scripts_filename_a); output.concat(L", bailing out..."); Console::put(output); Integral::exit(); } // contents of the script are programable, the user has to just modify the // the lines under the out_scripts string according to his batch system. // This script is generated for a Sungrid queuing system // if (clustername_a.eq(PARAM_CLUSTER_TALUS)) { out_scripts_a.assign(L"#$ -S /bin/sh"); out_scripts_a.concat(L"\n\n#$ -V"); out_scripts_a.concat(L"\n#$ -N Job#"); out_scripts_a.concat(job_index_a); out_scripts_a.concat(L"\n#$ -cwd"); out_scripts_a.concat(L"\n#$ -m e"); out_scripts_a.concat(L"\numask 0002\n"); out_scripts_a.concat(L"\ncd $SGE_O_WORKDIR\n"); out_scripts_a.concat(command_execute_a); for (int j = 0; j < list_length_a; j++) { if (reminder_list_a > 0 && j == list_length_a - 1) { j--; reminder_list_a--; } out_scripts_a.concat(L" "); sdblist_a.getName(inputfile_a); out_scripts_a.concat(inputfile_a); sdblist_a.gotoNext(); } scripts_file_a.put(out_scripts_a); scripts_file_a.put(L"\n"); // close the file // scripts_file_a.close(); } // This script is generated for a PBS queuing system // else if (clustername_a.eq(PARAM_CLUSTER_EMPIRE) ||clustername_a.eq(PARAM_CLUSTER_MAVERICK)) { out_scripts_a.assign(L"#!/bin/sh"); out_scripts_a.concat(L"\n\n#PBS -V"); out_scripts_a.concat(L"\n#PBS -N Job#"); out_scripts_a.concat(job_index_a); out_scripts_a.concat(L"\n#PBS -W umask=002\n"); out_scripts_a.concat(L"\n#PBS -l nodes=1:ppn=1\n"); out_scripts_a.concat(L"\n#PBS -l walltime="); out_scripts_a.concat(wall_time_a); out_scripts_a.concat(L"\ncd $PBS_O_WORKDIR\n"); out_scripts_a.concat(command_execute_a); for (int j = 0; j < list_length_a; j++) { if (reminder_list_a > 0 && j == list_length_a - 1) { j--; reminder_list_a--; } out_scripts_a.concat(L" "); sdblist_a.getName(inputfile_a); out_scripts_a.concat(inputfile_a); sdblist_a.gotoNext(); } out_scripts_a.concat(L"\n#PBS -m e"); scripts_file_a.put(out_scripts_a); scripts_file_a.put(L"\n"); // close the file // scripts_file_a.close(); } else { Error::handle(name(), L"cluster_name - cluster_name is not correct", Error::ARG, __FILE__, __LINE__); return false; } //exit gracefully // return true; } // method: generateCommand // // arguments: // String& log_filename : (input) logfile name // String& scripts_filename : (input) script file name // String& command : (input) command // String& cluster_name : (input) cluster name // // return: logical error status // // this method pass the command according to the cluster location. bool8 Splitter::generateCommand(String& log_filename_a, String& scripts_filename_a, String& command_a, String& clustername_a) { String output; // Job command is different according to the batch system. // This script is generated for a Sungrid queuing system // build the commandline for job submission // if (clustername_a.eq(PARAM_CLUSTER_TALUS)) { command_a.assign(L"qsub -j y -o "); command_a.concat(log_filename_a); command_a.concat(L" "); command_a.concat(scripts_filename_a); } else if (clustername_a.eq(PARAM_CLUSTER_EMPIRE) ||clustername_a.eq(PARAM_CLUSTER_MAVERICK)) { command_a.assign(L"qsub -z "); command_a.concat(L"-j oe -o "); command_a.concat(log_filename_a); command_a.concat(L" -q @"); command_a.concat(clustername_a); command_a.concat(L" "); command_a.concat(scripts_filename_a); } else { Error::handle(name(), L"cluster_name - cluster_name is not correct", Error::ARG, __FILE__, __LINE__); return false; } //exit gracefully // return true; }