#
# @(#) $Id: launcher.py,v 1.97 2001/12/18 22:02:23 ivm Exp $
#
# $Log: launcher.py,v $
# Revision 1.97  2001/12/18 22:02:23  ivm
# Open /dev/null as stdin for batch processes
#
# Revision 1.96  2001/12/18 17:48:47  ivm
# Close the process-launcher pipe in launcher
# Print more accurate error message in queueadmin
#
# Revision 1.94  2001/11/20 19:42:14  ivm
# Implemented CPU and real time limits for proc. type
# Fixed launcher reconfiguration bug
#
# Revision 1.93  2001/11/11 02:47:27  ivm
# Implemented process start time in Systat and psmodule (tested)
# Implemented launcher start time comparison in lock() (not tested)
#
# Revision 1.92  2001/11/07 03:53:28  ivm
# Removed debug dump of env to stderr
#
# Revision 1.91  2001/11/06 19:29:56  ivm
# Removed persistent node holds
# More accurate priority update algorithm in case of
# single active queue
#
# Revision 1.90  2001/11/05 19:22:03  ivm
# Fixed some bugs
# Made interactive spawner independent of UPS
#
# Revision 1.89  2001/10/27 18:00:15  ivm
# Implemented non-blocking process start
# Fixed some bugs
#
# Revision 1.88  2001/10/18 02:58:23  ivm
# Fixed bug in handling of new scratch disks in launcher.py
# Implemented runable/non-runable cache in Scheduler, Queue
#
# Revision 1.86  2001/09/25 18:41:38  ivm
# Changes for Python 2.1
#
# Revision 1.85  2001/09/10 20:44:32  ivm
# Removed debug print-outs
#
# Revision 1.84  2001/08/28 20:36:25  ivm
# Re-read kinit cmd, scratch areas every time before process start
# Do not open LogClient until successfully connected to BMGR
#
# Revision 1.83  2001/06/12 19:47:53  ivm
# Updated for Python v2.1
#
# Revision 1.82  2001/04/26 17:22:43  ivm
# Fixed some bugs
# Added doc/ReleaseNotes_v1.3.txt
#
# Revision 1.80  2001/03/15 21:47:54  ivm
# Implemented "on nodes"
# Fixed protocol version handling in lch, lchif
#
# Revision 1.79  2001/02/26 22:28:33  ivm
# Fixed kill.py
# Added RELEASE message sent from LchIF to launcher to clear held status
#
# Revision 1.78  2001/02/08 19:15:51  ivm
# Added protocol version verification
#
# Revision 1.77  2001/02/05 16:53:02  ivm
# Send create_k5_credentials from BMGR to launcher
# If no kinit command defined, but credentials are required, hold node
#
# Revision 1.76  2001/01/25 18:59:44  ivm
# More accurate CPU time accounting for just exited processes
#
# Revision 1.74  2001/01/08 16:49:56  ivm
# Slower time unit change in Scheduler
# Implemented probing launcher -> bmgr
#
# Revision 1.73  2000/12/21 18:04:46  ivm
# Call LogClient.idle()
#
# Revision 1.70  2000/12/18 20:30:17  ivm
# Use flock in Launcher
# Put inactive queue priorities to min in Scheduler
#
# Revision 1.69  2000/11/30 20:23:17  ivm
# Fixed bugs
# Made Scheduler more conservative about unknown queues/ptypes
# Use /tmp/launcher.pid for launcher inter-locking
#
# Revision 1.68  2000/11/01 19:23:03  ivm
# Fixed minor bugs
# Updated Users Guide
#
# Revision 1.67  2000/10/26 21:29:46  ivm
# Fixed killing stuff
#
# Revision 1.66  2000/10/24 16:05:14  ivm
# Send Exec to BMGR as serialized list, not string
# Call killJob, not killSection, killSection, ... in kill.py
#
# Revision 1.65  2000/10/20 19:51:01  ivm
# Added misccmodule
# Call initgroups() before setgid() setuid() in launcher
# Copy kinit stderr, stdout to process stderr, stdout
#
# Revision 1.63  2000/10/11 19:36:58  ivm
# Added FBSNodeInfo.holdNode()
# Implemented credentials creation by Launcher
#
# Revision 1.62  2000/10/06 17:24:14  ivm
# Removed debug print-outs
#
# Revision 1.60  2000/10/04 17:28:53  ivm
# Implemented '#' loophole
#
# Revision 1.59  2000/10/02 19:14:13  ivm
# Cosmetics in NetIF error messages
# Repeat waitpid() again and again in Launcher
#
# Revision 1.58  2000/09/27 20:18:37  ivm
# Renamed Scheduler.canRun() to trigger()
#
# Revision 1.55  2000/09/12 20:07:46  ivm
# Allow no local disks specified for node classes
#
# Revision 1.54  2000/09/07 17:54:53  ivm
# Implemented dynamic modification of local scratch disk mapping
#
# Revision 1.53  2000/08/18 21:15:42  ivm
# Implemented dynamic re-configuration
#
# Revision 1.52  2000/08/08 16:19:46  ivm
# Fixed user message sending from batch process to API
#
# Revision 1.51  2000/08/03 20:54:37  ivm
# Fixed more bugs, implemented SCRATCH_<requested-resource> variables.
#
# Revision 1.49  2000/08/02 16:36:05  ivm
# Fixed numerous bugs
#
# Revision 1.48  2000/08/02 15:08:32  ivm
# Use RM.py instead of ResMgr.py
# Fix syntax error in Launcher
#
# Revision 1.47  2000/07/26 14:38:30  ivm
# Implemented RPs and multiple local disks
#
# Revision 1.46  2000/07/25 18:30:07  ivm
# Pass section and process pool dictionaries to process environment
#
# Revision 1.45  2000/07/05 19:04:09  ivm
# Removed holding due to failure creating scratch dir.
#
# Revision 1.41  2000/06/26 15:07:59  ivm
# Add -d for debug
#
# Revision 1.40  2000/06/21 15:20:39  ivm
# Do not send exited history if bmgr disconnected
#
# Revision 1.38  2000/06/19 15:57:06  ivm
# Launcher remembers self-hold and sends it to bmgr on connect
#
# Revision 1.36  2000/06/19 14:39:45  ivm
# LchIF holds node.
# Changes for improved section saving.
#
# Revision 1.34  2000/06/08 18:32:44  ivm
# Order sections by JDFSeq in FBSJobInfo.sections()
#
# Revision 1.33  2000/06/05 20:45:25  ivm
# Make sure numeric section names come out as strings
# Restore signals in JobDB
# Fixed pid file name in Launcher
# Fixed memory leak in NetIF
#
# Revision 1.32  2000/05/30 16:15:20  ivm
# Decode special exit codes and send them to bmgr
#
# Revision 1.29  2000/05/22 16:16:00  ivm
# Check for CPU time limits
#
# Revision 1.28  2000/05/22 14:27:17  ivm
# Save own PID in /tmp/launcher.pid
#
# Revision 1.27  2000/05/16 17:04:42  ivm
# Fixed "msg"
#
# Revision 1.26  2000/05/11 17:22:01  ivm
# Added r2shm and related code.
# Added SectOutput related code
#
# Revision 1.25  2000/05/08 17:58:42  ivm
# Create scratch directory during start-up
#
# Revision 1.23  2000/04/26 15:37:19  ivm
# Send UPID back to BMGR
#
# Revision 1.22  2000/04/20 21:18:43  ivm
# Fixed inter-queue scheduling
#
# Revision 1.21  2000/04/19 14:27:32  ivm
# Fixed bug with EXITED
#
# Revision 1.20  2000/04/17 15:43:43  ivm
# Do not query Launcher about non-running processes
#
# Revision 1.19  2000/04/12 15:59:44  ivm
# Added calls to LogClient
#
# Revision 1.16  2000/04/03 22:12:59  ivm
# Implemented hard kill
#
# Revision 1.14  2000/03/31 23:13:35  ivm
# Implemented process status API - Launcher protocol
#
# Revision 1.12  2000/03/30 20:44:47  ivm
# Removed and added some printouts
#
# Revision 1.11  2000/03/07 21:04:41  ivm
# Fixed bug
#
# Revision 1.9  2000/02/22 14:42:32  ivm
# Fixed some bugs
#
# Revision 1.8  2000/02/21 18:59:08  ivm
# Added reason for killing
#
# Revision 1.7  2000/02/21 18:53:58  ivm
# Added nice(), setsid(), scratch dir. clean-up
#
# Revision 1.2  2000/02/03 20:03:06  ivm
# *** empty log message ***
#
# Revision 1.1  2000/01/26 19:52:42  ivm
# Added launcher.py
#
# Revision 1.1  1999/02/17 16:58:51  ivm
# Prototype version
#
#

from SockStream import *
from socket import *
from Systat import Systat
from LogClient import *
import select
import string
import os
import time
import signal
import sys
import fbs_misc
import pwd
import futil
import errno
import Parser
import miscc
import serialize
from Timer import Timer
import fcntl
import stat


# Protocol version string included in the HELLO/HOLD greeting sent to BMGR
ProtocolVersion = '1.4'

SysStat = Systat()	# Process directory
# External command run as '<cmd> c|g|d -f <key file>' for the per-process
# "msg" shared memory segment (c in start(), g in getStatusLine(), d in exited())
ShmCmd = 'r2shm'
# When non-zero, BatchProcess.log() also prints every message to stdout
Debug = 0
# Directory in which the shared memory key files are created
TmpDir = '/fbsng/tmp'

# Log client; while it is None the log() methods print to stdout instead
LogC = None

Version = '$Id: launcher.py,v 1.97 2001/12/18 22:02:23 ivm Exp $ + reconfig'

class	BatchProcess:

	# Exit statuses with a fixed meaning.  exited() looks up (status >> 8)
	# in this table and, when the code is present, stores the text as the
	# Reason of the process.  These are the exit statuses used by the child
	# processes forked in start() and doKinit().
	SpecialCodes = {
		100: 	'Invalid UID or GID',
		101:	'Failed to create stdout file',
		102:	'Failed to create stderr file',
		103:	'Home directory not found',
		104:	'exec() failed',
		105:	'exec() failed',
		110:	'Error creating Kerberos credentials'
		}

#	def __init__(self, logc, bpid, size, username, out, err, nice,
#			cpulim, reallim, nlst, cmd, wddict, defwd, sect_pool_dict, 
#			proc_pool_dict):
	def __init__(self, bpid, size, username, cpulim, reallim):
		# Create the bookkeeping record for one batch process.  Nothing is
		# started here; start() does the real work.

		# what the caller told us about the process
		self.Bpid = bpid
		self.JobSize = size
		self.Username = username
		self.CPULimit = cpulim
		self.RealLimit = reallim
		# the batch process id carries job id, section name and process number
		self.JobID, self.SectName, self.ProcNo = fbs_misc.decodeDotID(
			self.Bpid)

		# run-time state, filled in as the process is started and followed
		self.Upid = None
		self.Status = None
		self.ExitCode = None
		self.StartTime = None
		self.CPUTime = 0
		self.Reason = ''
		self.Errt = ''
		self.Cmd = []
		self.Proc2LchFD = None

		# files and directories belonging to the process
		self.HomeDir = ''
		self.OutFileName = ''
		self.ErrFileName = ''
		self.ShmFile = None
		self.K5CacheFile = None
		self.WDs = []
		self.DefWD = None
				
	def log(self, level, id, msg):
		if Debug:
			print 'PROC<%s>(%s,%s): %s' % (self.Bpid, level, id, msg)
		if LogC != None:
			LogC.log(level, id, 'PROC<%s>: %s' % (self.Bpid, msg))
		else:
			print '[No Log Client] PROC<%s>: %s' % (self.Bpid, msg)
	
	def verify(self):
		try:	
			pwrec = pwd.getpwnam(self.Username)
			uid, gid = pwrec[2:4]
		except:
			msg = 'Can not get pwd record for user <%s>: %s %s' % (self.Username,
					sys.exc_type, sys.exc_value)
			self.Reason = msg
			self.Errt = 'ERRH'
			return 0, 0, 0
			
		if uid < 10 or gid < 10:
			msg = 'Invalid UID = %s and/or GID = %s' % (uid, gid)
			self.Reason = msg
			self.Errt = 'ERR'
			return 0, 0, 0
		self.HomeDir = pwrec[5]
		try: os.stat(self.HomeDir)
		except:
			msg = 'Home directory "%s" for user <%s> does not exist: %s %s' % \
				(self.HomeDir, self.Username, sys.exc_type, sys.exc_value)
			self.Errt = 'ERRH'
			self.Reason = msg
			self.log('E', 110, msg)
			self.HomeDir = None
			return 0, 0, 0
		if pwrec[1] == '*':
			msg = 'User <%s> is restricted' % self.Username
			self.Errt = 'ERR'
			self.Reason = msg
			self.log('E', 110, msg)
			return 0, 0, 0
		return 1, uid, gid

	def replace(self, str, x, y):
		# Return a copy of str in which every non-overlapping occurrence of
		# x, found scanning left to right, is replaced by y.  Text inserted
		# by a replacement is not scanned again.
		#
		# An empty pattern made the previous hand-written search loop run
		# forever; such a pattern now leaves the string unchanged.
		if not x:
			return str
		return str.replace(x, y)
		
	def formatFileName(self, format, dfltname):
		# Turn an output/error file name pattern into a file name.
		#   - an empty pattern or '.' means "default name in home directory"
		#   - a pattern not starting with '/' is taken relative to self.HomeDir
		#   - a pattern ending with '/' gets dfltname appended
		# Afterwards %n, %j, %S and %J are substituted with the process
		# number, the encoded section id, the section name and the job id.
		procno = '%s' % self.ProcNo
		jobid = '%s' % self.JobID
		sectname = self.SectName
		sectid = fbs_misc.encodeSectionID(jobid, sectname)

		pattern = format
		if len(pattern) == 0 or pattern == '.':
			pattern = './'
		if pattern[0] != '/':
			pattern = self.HomeDir + '/' + pattern
		if pattern[-1] == '/':
			pattern = pattern + dfltname

		for key, value in (('%n', procno), ('%j', sectid),
				('%S', sectname), ('%J', jobid)):
			pattern = self.replace(pattern, key, value)
		return pattern

	def createEnv(self, uid, gid, out, err, defwd, wdmap):
		procnam = self.Bpid
		failure = ''
		try:
			failure = 'processing output file name pattern'
			self.OutFileName = self.formatFileName(out,
				 'FBS_%j.%n.out')
			#print 'Out file = <%s>' % self.OutFileName

			failure = 'processing error file name pattern'
			self.ErrFileName = self.formatFileName(err,
				 'FBS_%j.%n.err')
			#print 'Err file = <%s>' % self.ErrFileName
			self.WDs = []
			self.WDDict = {}
			for rwd, awd in wdmap.items():
				wd = '%s/%s_%s' % (awd, self.Bpid, rwd)
				self.WDs.append(wd)
				self.WDDict[rwd] = wd

			if not self.WDs:
				if defwd:
					self.DefWD = '%s/%s' % (defwd, self.Bpid)
					self.WDs = [self.DefWD]
			else:
				self.DefWD = self.WDs[0]

			for wd in self.WDs:
				try:	futil.rmdirrec(wd)
				except:	pass

				failure = 'creating scratch directory <%s>' % wd
				try:	os.mkdir(wd, 0700)
				except os.error, reason:
					sts, txt = reason
					if sts == errno.EEXIST:	# directory exists
						pass
					else:
						#self.HealthMon.sleep('Error creating scratch directory %s: %s' %
						#	(self.WD, reason))
						raise os.error, reason
				failure = "chown'ing scratch directory <%s>" % wd
				try:	os.chown(wd, uid, gid)
				except:
					#self.HealthMon.sleep('chown(%s): %s %s' % (self.WD, sys.exc_type, sys.exc_value))
					raise sys.exc_type, sys.exc_value

			failure = 'creating shm key file <%s>' % self.ShmFile
			os.close(os.open(self.ShmFile,os.O_CREAT,0755))
		except:
			raise sys.exc_type, (failure, sys.exc_value)

			
	#def	getStatusLine(self):
	#	f = os.popen("%s g -f %s" % (ShmCmd, self.ShmFile),'r', 1024)
	#	l = f.readline()
	#	#print 'Proc %s: status = "%s"' % (self.ProcName, l)
	#	f.close()
	#	#print 'Pipe closed'
	#	return string.strip(l)

	def doRead(self, fd, sel):
		if fd != self.Proc2LchFD:
			return
		msg = os.read(self.Proc2LchFD, 1024)
		if not msg:
			os.close(fd)
			sel.unregister(rd=[fd])
			if self.Proc2LchBuf:
				self.log('C', 1, self.Proc2LchBuf + '...(EOF)')
			return
		self.Proc2LchBuf = self.Proc2LchBuf + msg
		inx = string.find(self.Proc2LchBuf, '\n')
		while self.Proc2LchBuf and inx >= 0:
			msg = self.Proc2LchBuf[:inx]
			self.Proc2LchBuf = self.Proc2LchBuf[inx+1:]
			if msg:
				self.log('C', 1, msg)
			inx = string.find(self.Proc2LchBuf, '\n')

	def doKinit(self, cmd, cacheFile, uid, gid):
		# returns kinit exit status and stdout and stderr lines
		if type(cmd) == type(()):
			cmd = list(cmd)
		cmd = [cmd[0]] + ['-c',cacheFile] + cmd[1:]
		
		errp, errc = os.pipe()
		outp, outc = os.pipe()
		errf = os.fdopen(errp)
		outf = os.fdopen(outp)
		pid = os.fork()
		sts = -1
		if pid > 0:
			# parent, just wait for child
			os.close(outc)
			os.close(errc)
			x, sts = os.waitpid(pid, 0)
		elif pid == 0:
			# child	
			os.close(outp)
			os.close(errp)
			os.close(0)
			os.close(1)
			os.close(2)
			os.dup2(outc, 1)
			os.dup2(errc, 2)
			for fd in range(3,1024):
				try:	os.close(fd)
				except: pass
			try:	os.execvp(cmd[0], cmd)
			except:
				os.write(2, 'kinit (%s):\n   %s %s\n' % 
					(string.join(cmd), sys.exc_type, sys.exc_value))
				sys.exit(110)
		else:
			# fork error
			os.close(errp)
			os.close(outp)
			return (110<<8), [], ['kinit fork error']
		outlines = outf.readlines()
		errlines = errf.readlines()
		#print outlines
		#print errlines
		outf.close()
		errf.close()
		try:	os.chown(cacheFile, uid, gid)
		except:
			errlines.append('K5Cache chown(%s,%s,%s)\n   %s %s\n' %
				(cacheFile, uid, gid, sys.exc_type, sys.exc_value))
		return sts, outlines, errlines
					
	def start(self, sel, cmd, out, err, nice, nlist, defwd, wdmap, sect_pool_dict, 
			proc_pool_dict, create_credentials, kinit_cmd):

		if type(cmd) == type(''):
			cmd = Parser.parseWords(cmd, cvtInts=0)

		self.Cmd = cmd
		self.StartTime = time.time()
		self.CPUTime = 0
		self.Errt = ''
		self.ShmFile = '%s/.fbsng_shmkey_%s' % (TmpDir,self.Bpid)
		if kinit_cmd:
			self.K5CacheFile = '/tmp/.fbs_k5cc_%s' % self.Bpid
	

		# verify information
		sts, uid, gid = self.verify()
		if not sts:
			return -1
		# create environment
		try:	self.createEnv(uid, gid, out, err, defwd, wdmap)
		except:
			r1, r2 = sys.exc_value
			self.Reason = 'Failure %s: %s: %s' % (r1, sys.exc_type, r2)
			self.Errt = 'ERRH'
			return -1

		# if requested, issue kinit command for the user
		kinit_out = []
		kinit_error = []
		kinit_sts = 0
		if kinit_cmd:
			kinit_sts, kinit_out, kinit_error = self.doKinit(kinit_cmd,
					self.K5CacheFile, uid, gid)
			#print 'kinit_out: ', kinit_out
			#print 'kinit_error: ', kinit_error

			if create_credentials == 'required' and kinit_sts:
				msg = 'Failure creating required Kerberos credentials for user <%s>, bpid <%s>' % \
					(self.Username, self.Bpid)
				self.Reason = msg
				self.Errt = 'ERRH'
				self.log('E',200, msg)
				self.log('E',200, 'kinit command: <%s>' % string.join(kinit_cmd))
				self.log('E',200, 'cache file: <%s>' % self.K5CacheFile)
				for l in kinit_out:
					self.log('E',200, 'kinit stdout: <%s>' % string.strip(l))
				for l in kinit_error:
					self.log('E',200, 'kinit stderr: <%s>' % string.strip(l))
				return -1
				
		prd, pwr = os.pipe()

		try:	pid = os.fork()
		except:
			msg = 'Fork failed: %s %s' % (sys.exc_type, sys.exc_value)
			self.Reason = msg
			self.Errt = 'ERRH'
			return -1

		if pid > 0:
			# parent
			os.close(pwr)
			sel.register(self, rd=[prd])
			self.Proc2LchFD = prd
			self.Upid = pid
			self.Proc2LchBuf = ''
			return pid

		elif pid < 0:
			# error
			msg = 'Fork failed: %d' % pid
			self.Reason = msg
			self.Errt = 'ERRH'
			return pid

		else:
			# child
			# setuid/setgid
			os.close(prd)
			try:	
				miscc.initgroups(self.Username, gid)
				os.setgid(gid)
				os.setuid(uid)
			except:
				msg = 'Error in initgroups(%s, %s): %s %s\n' % \
					(self.Username, gid, sys.exc_type, sys.exc_value)
				os.write(pwr, msg)
				os._exit(100)

			#print 'uid/gid = %d/%d' % (os.getuid(), os.getgid())
			#print 'Creating output <%s>' % self.OutFileName
			#print 'Creating error <%s>' % self.ErrFileName

			# close all open files
			for i in range (1024):
				if i != pwr:
					try:	os.close(i)
					except:	pass

			# open stdin
			try:	fd = os.open('/dev/null', os.O_RDONLY)
			except:
				msg = 'Error opening /dev/null as stdin: %s %s\n' % \
					(sys.exc_type, sys.exc_value)
				os.write(pwr, msg)
				#print sys.exc_type, sys.exc_value
			else:
				if fd != 0:
					os.dup2(fd, 0)
					os.close(fd)

			# open stdout
			try:	fd = os.open(self.OutFileName, os.O_CREAT +
					os.O_WRONLY + os.O_TRUNC, 0666)
			except:
				#print sys.exc_type, sys.exc_value
				os._exit(101)
			if fd != 1:
				os.dup2(fd, 1)
				os.close(fd)

			#open stderr
			try:	fd = os.open(self.ErrFileName, os.O_CREAT +
					os.O_WRONLY + os.O_TRUNC, 0666)
			except:
				os._exit(102)
			if fd != 2:
				os.dup2(fd, 2)
				os.close(fd)

			# write kinit output/error if any and if needed
			if kinit_sts:
				for l in kinit_error:
					os.write(2, l)
				os.write(pwr,'kinit failed with exit status %d\n' % (kinit_sts>>8))
				os.write(2,'kinit failed with exit status %d\n' % (kinit_sts>>8))
				for l in kinit_out:
					os.write(1, l)

			# create shared memory segment for "msg"
			#try:	os.open(self.ShmFile, os.O_CREAT, 0755)
			#except: 
			#	msg = "FBSNG: Error %s %s creating shared memory key file <%s>" %\
			#		(sys.exc_type, sys.exc_value, self.ShmFile)
			#	os.write(2, msg + '\n')

			try:	os.system('%s c -f %s' % (ShmCmd,self.ShmFile))
			except:	
				msg = "FBSNG: Error %s %s creating shared memory segment with key file <%s>" %\
					(sys.exc_type, sys.exc_value, self.ShmFile)
				os.write(pwr, msg + '\n')
				os.write(2, msg + '\n')

			# cd $home
			try:	os.chdir(self.HomeDir)
			except:
				msg = "FBSNG: Error chdir(%s): %s %s" %\
					(self.HomeDir, sys.exc_type, 
					sys.exc_value)
				os.write(pwr, msg + '\n')
				os.write(2, msg + '\n')
				os.close(2)
				os.close(1)
				os._exit(103)

			ppstr = ''
			for k, v in proc_pool_dict.items():
				ppstr = ppstr + '%s:%s ' % (k, v)
			ppstr = string.strip(ppstr)
			
			spstr = ''
			for k, v in sect_pool_dict.items():
				spstr = spstr + '%s:%s ' % (k, v)
			spstr = string.strip(spstr)
			
			env = {	
					'FBS_JOB_ID':('%s' % self.JobID),
			        'FBS_JOB_SIZE':('%s' % self.JobSize),
			        'FBS_PROC_NO':('%s' % self.ProcNo),
					'FBS_SECTION_NAME':('%s' % self.SectName),
					'FBS_HOSTS':string.join(nlist),
					'HOME':('%s' % self.HomeDir),
					'FBS_PROC_POOLS':ppstr,
					'FBS_SECT_POOLS':spstr,
					'FBS_PROC_STDOUT':self.OutFileName,
					'FBS_PROC_STDERR':self.ErrFileName,
					'FBS_SHM_KEY':self.ShmFile
			}

			if self.K5CacheFile:
				env['KRB5CCNAME'] = self.K5CacheFile

			if self.DefWD:
				env['FBS_SCRATCH'] = self.DefWD

			#print self.WDDict
			for rwd, awd in self.WDDict.items():
				env['FBS_SCRATCH_%s' % rwd] = awd

			try:	os.setsid()
			except:
				msg = 'setsid() failed: %s %s\n' % (sys.exc_type, sys.exc_value)
				os.write(pwr, msg)


			if nice:
				try:	os.nice(nice)
				except: 
					msg = 'nice(%d) failed: %s %s\n' % \
						(nice, sys.exc_type, sys.exc_value)
					os.write(pwr, msg)

			# substitute leading '#' with $FBSNG_DIR
			# loop hole for interactive job spawner
			executable = cmd[0]
			keep_env = 0
			if executable and executable[0] == '#':
				executable = sys.executable
				keep_env = 1
				try:	mydir = os.environ['FBSNG_DIR'] + '/bin'
				except: 
					msg = 'Can not find FBSNG bin directory: %s %s' % \
							(sys.exc_type, sys.exc_value)
					os.write(pwr, '%s\n' % msg)
					os.write(2, 'Interactive job spawner error: %s\n' % msg)
					os.close(2)
					os.close(1)
					os._exit(105)	# error !!!
				try:
					env['PYTHONPATH'] = os.environ['PYTHONPATH']
				except: pass
				try:
					env['PYTHONHOME'] = os.environ['PYTHONHOME']
				except: pass
				cmd = ['spawner','%s/spawner.py' % mydir] + cmd[1:]

			try:
				os.write(pwr, 'exec(%s, %s)...\n' % (executable, string.join(cmd)))
				os.close(pwr)
				#os.write(2, 'env: %s\n' % env)
				os.execve(executable, cmd, env)
			except:
				msg = "FBS: Error starting user's program: %s %s" %\
					(sys.exc_type, sys.exc_value)
				os.write(2, msg + '\n')
				os.write(2, 'User command: ')
				for a in cmd:
					os.write(2, '<%s> ' % a)
				os.write(2,'\n')
				os.close(2)
				os.close(1)
				os._exit(104)	# error !!!
			# we should never come here after exec()
			os.write(2, 'Unexpected error in exec\n')
			os._exit(105)	# error !!!
		self.StartTime = time.time()
		return self.Upid
	
	def initKill(self, reason, flag):
		#print 'initKill'
		self.Reason = reason
		signo = signal.SIGINT
		self.Status = ('KILLING',time.time())
		if flag:	signo = signal.SIGKILL
		if SysStat != None:
			SysStat.update()
			if SysStat:
				self.log('D',0,'initKill: kill session and tree (%s, %s)' % 
						(self.Upid, signo))
				SysStat.killSessionAndTree(self.Upid, signo)
				return
		self.log('D',0,'initKill: SysStat is empty, killing parent (%s)' %
			self.Upid)
		try:	
			os.kill(self.Upid, signo)
			self.log('D',0,'initKill: os.kill(%s, INT)' % self.Upid)
		except:
			os.log('D',0,'initKill: kill failed: %s %s' % 
				(sys.exc_type, sys.exc_value))
		
	def finishKill(self):
		if self.Status == None: 	return
		sts, arg = self.Status
		# print 'finishKill: sts/arg=', sts, arg
		if sts == 'KILLING' and time.time() < arg + 30:
			return  # later
		if SysStat != None:
			SysStat.update()
			if SysStat:
				SysStat.killSessionAndTree(self.Upid, signal.SIGKILL)
				self.log('D',0,'finishKill: SysStat.killSessionTree(%s, KILL)' % self.Upid)
				return
		self.log('D',0,'initKill: SysStat is empty, killing parent (%s)' %
			self.Upid)
		try:	
			os.kill(self.Upid, signal.SIGKILL)
			self.log('D',0,'finishKill: os.kill(%s, KILL)' % self.Upid)
		except os.error:
			os.log('D',0,'finishKill: kill failed: %s %s' % 
				(sys.exc_type, sys.exc_value))
			

	def exited(self, code):
		self.Status = ('EXITED',time.time())
		self.ExitCode = code
		c = code >> 8
		if c in self.SpecialCodes.keys():
			self.Reason = self.SpecialCodes[c]

		# do clean-up
		if SysStat != None:
			SysStat.update()
			if SysStat:
				SysStat.killSession(self.Upid, signal.SIGKILL)

		# remove "msg" shared memory segment
		try:	os.system('%s d -f %s' % (ShmCmd,self.ShmFile))
		except:	
			msg = "Error %s %s creating shared memory segment with file <%s>" %\
				(sys.exc_type, sys.exc_value, self.ShmFile)
			self.log('E', 125, msg)
			#os.write(2, msg + '\n')

		try:	os.remove(self.ShmFile)
		except:
			msg = "Error %s %s deleting shared memory key file <%s>" %\
				(sys.exc_type, sys.exc_value, self.ShmFile)
			self.log('E', 125, msg)

		if self.K5CacheFile:
			try:
				# overwrite cache file
				f=open(self.K5CacheFile,'r+')
				f.write(' '*int(os.fstat(f.fileno())[stat.ST_SIZE]))
				f.close()
				os.remove(self.K5CacheFile)
			except:
				msg = "Error %s %s deleting K5 c.cache file <%s>" %\
					(sys.exc_type, sys.exc_value, self.K5CacheFile)
				self.log('E', 125, msg)

		# remove scratch directory
		for wd in self.WDs:
			#print 'rmdirrec(%s)' % wd
			try:	futil.rmdirrec(wd)
			except:	
				self.log('E', 125, 'Error deleting <%s>: %s %s' % 
					(wd, sys.exc_type, sys.exc_value))
				#self.HealthMon.sleep('Can not delete scratch dir')
				pass
			else:
				#self.log('D', 125, 'Scratch dir <%s> deleted.' % self.WD)
				pass

	def getStatusLine(self):
		# Run '<ShmCmd> g -f <key file>' and return the first line of its
		# output with surrounding white space removed.
		command = "%s g -f %s" % (ShmCmd, self.ShmFile)
		pipe = os.popen(command,'r', 1024)
		first = pipe.readline()
		pipe.close()
		return string.strip(first)

		
	def getStat(self):
		str = 'PROC %s ' % self.Bpid
		return str + self.getStatRec(0, self.Upid)

	def getStatRec(self, level, pid):
		str = '%d 0 0 [%s] %s\n' %\
				(self.Upid, self.getStatusLine(), string.join(self.Cmd))
		if not SysStat:
			return str
		try:	pinfo = SysStat[pid]
		except KeyError:
			return str
		str = ''
		if pinfo == None:	return str
		if level == 0:
			str = str + '%d %d %d [%s] %s\n' %\
			(pid, pinfo.cpu, pinfo.acpu, 
				self.getStatusLine(), string.join(pinfo.cmd))
			if pinfo.acpu > self.CPUTime:
				self.CPUTime = pinfo.acpu
		else:
			str = str + '%d %d %d %d %s\n' %\
				(level, pid, pinfo.cpu, pinfo.acpu, 
					string.join(pinfo.cmd))
		for pid in pinfo.Children:
			str = str + self.getStatRec(level+1, pid)
		return str

	def recordTime(self):
		#print 'recordTime(%s)' % self.Bpid,
		if SysStat:
			try:	
				pi = SysStat[self.Upid]
				#print 'pi: ', pi.__dict__,
				if pi.acpu > self.CPUTime:
					self.CPUTime = pi.acpu
					#print '     CPUTime -> ', self.CPUTime,
			except:
				pass
		#print ''
						
class	SchInterface:
	def __init__(self, lnchr, cfg, sel):
		# Interface to BMGR.  The BMGR address is read from the 'bmgr'
		# section of the configuration (host, default 'localhost', and
		# launcher_if_port, default 5557).  A first connection attempt is
		# made right away.
		self.Lnchr = lnchr
		self.Selector = sel
		self.Sock = None
		self.Str = None
		self.Status = 'DISCONNECTED'
		self.KnownProcs = {}
		self.ExitHist = {}
		host = cfg.getValue('bmgr', '*', 'host', 'localhost')
		port = cfg.getValue('bmgr', '*', 'launcher_if_port', 5557)
		self.SchAddr = (host, port)
		self.connect()

	def log(self, level, id, msg):
		#print 'SCHIF(%s,%s): %s' % (level, id, msg)
		if LogC != None:
			LogC.log(level, id, 'SCHIF: %s' % msg)
		else:
			print '[No Log Client] SCHIF: %s' % msg
		
	def sendHello(self):
		pidstr = ''
		for bpid, upid in self.KnownProcs.items():
			pidstr = pidstr + '%s:%s,' % (bpid, upid)
		if not pidstr:
			pidstr = '_'
		if pidstr[-1] == ',':
			pidstr = pidstr[:-1]
		if self.Lnchr.Status == 'OK':
			hello = 'HELLO %s %s %s' % (ProtocolVersion, self.Lnchr.Id, pidstr)
		else:	# hold
			hello = 'HOLD %s %s %s %s' % (ProtocolVersion, self.Lnchr.Id, pidstr, 
					self.Lnchr.HoldReason)
		self.log('D', 101, 'HELLO: %s' % hello)
		ans = self.Str.sendAndRecv(hello)
		self.log('D', 102, 'Answer to HELLO: %s' % ans)
		if ans == None:
			return 0, 'BMGR disconnected: %s' % ans
		words = string.split(ans)
		if len(words) < 1 or words[0] != 'OK':
			return 0, ans
		return 1, 'OK'

	def connect(self):
		if self.Status == 'DISCONNECTED':
			self.Sock = socket(AF_INET, SOCK_STREAM)
			try:	self.Sock.connect(self.SchAddr)
			except:
				# self.log('E',0,'connect: %s, %s' % (sys.exc_type, sys.exc_value))
				self.disconnect()
				return
			self.Str = SockStream(self.Sock, '\n')
			# we have connected to the scheduler. Send him 'hello'
			sts, reason = self.sendHello()
			if sts:
				self.sendExitedHist()
				self.Selector.register(self, rd = self.Sock.fileno())
				self.Status = 'CONNECTED'
				self.log('I',200,'Connected to BMGR')
			else:
				self.log('E',200,'Error establishing connection to BMGR: %s' %
						reason)
				self.disconnect()
				return

	def disconnect(self):
		# Drop the connection to BMGR: forget the stream, unregister and
		# close the socket if there is one, mark the interface as
		# 'DISCONNECTED' and then sleep for 10 seconds.
		self.Str = None
		sock = self.Sock
		if sock != None:
			self.Selector.unregister(rd = sock.fileno())
			sock.close()
			self.Sock = None
		self.Status = 'DISCONNECTED'
		time.sleep(10)

	def probe(self):
		if self.Status == 'CONNECTED' and self.Str != None:
			self.Str.probe()
			

	def sendExitedHist(self):
		for pid, info in self.ExitHist.items():
			code, reason, cpu = info
			self.Str.send('EXITED %s %s %s %s' % (pid, code, cpu, reason))
			self.log('D',0,'< (exh) <EXITED %s %s %s %s>' % (pid, code, cpu, reason))
		
	def idle(self):
		if self.Status == 'DISCONNECTED':
			self.connect()

	def doRead(self, fd, sel):
		if self.Sock == None or self.Sock.fileno() != fd:
			return
		self.Str.readMore(1000)
		#print 'run: readMore: <%s>' % self.Str.Buf
		while self.Str.msgReady():
			msg = self.Str.getMsg()
			self.log('D',0,'> (read) <%s>' % msg)
			answ = self.processMsg(msg)
			if answ != None:
				self.Str.send(answ)
				self.log('D',0,'< (read) <%s>' % answ)
		if self.Str.eof():
			# Sch disconnected.
			self.log('I',201,'BMGR disconnected')
			self.disconnect()
		else:
			self.sendExitedHist()
								
	def processMsg(self, msg):
		#print 'processMsg(%s)' % msg
		words = string.split(msg,None,1)
		l = len(words)
		if words[0] == 'START' and l > 1:
			return self.doStart(words[1])
		elif words[0] == 'KILL' and l > 1:
			return self.doKill(words[1])
		elif words[0] == 'DEL' and l > 1:
			return self.doDel(words[1])
		elif words[0] == 'RELEASE':
			return self.doRelease()
		else:
			return 'ERRH Syntax error in <%s>' % msg

	def doRelease(self):
		# RELEASE
		self.Lnchr.release()
		return None		

	def doDel(self, args):
		# DEL <pid>
		words = string.split(args)
		if len(words) < 1:
			return 'ERRH Syntax error in <%s>' % string.join(words)
		if self.ExitHist.has_key(words[0]):
			del self.ExitHist[words[0]]
		if self.KnownProcs.has_key(words[0]):
			del self.KnownProcs[words[0]]
		return None

	def doStart(self, args):
		words = string.split(args,None,13)
		bpid = words[0]
		if len(words) < 14:
			return 'ERRH %s Syntax error in START <%s>' % (bpid, args)
			
		# bpid size uid gid out err nice cpulim reallim nlist cmd
		try:
			#print words
			jobid, sname, procno = fbs_misc.decodeDotID(bpid)
			size = string.atoi(words[1])
			username = words[2]	
			krbopt = words[3]
			#gid = string.atoi(words[3])	
			out = words[4]
			err = words[5]
			nice = string.atoi(words[6])	
			cpulim = string.atoi(words[7])	
			reallim = string.atoi(words[8])	
			nlist = string.split(words[9],',')
			local_rsrc = Parser.wordsToDict(string.split(words[10],','),
								defValue=0)
			sect_pool_dict = Parser.wordsToDict(string.split(words[11],','))
			proc_pool_dict = Parser.wordsToDict(string.split(words[12],','))
			cmd, junk = serialize.deserialize(words[13])
		except:
			return 'ERRH %s Syntax error in <%s>, %s %s' % \
				(bpid, string.join(words), sys.exc_type, sys.exc_value)

		pid, errt, reason = self.Lnchr.startProc(bpid, size, username, 
				krbopt, out, err,
				nice, cpulim, reallim, nlist, cmd, local_rsrc, sect_pool_dict,
				proc_pool_dict)
		self.log('I',0,'startProc(%s): %s,%s' % (bpid,errt,reason))
		if pid > 0:
			self.KnownProcs[bpid] = pid
			return 'OK %s %d' % (bpid, pid)
		else:
			return '%s %s %s' % (errt, bpid, reason)

	def doKill(self, args):
		# <bpid> <flag> [reason]
		reason = 'Killed by BMGR'
		words = string.split(args, None, 2)
		if len(words) < 2:
			return 'ERRH Syntax error in <%s>' % args
		bpid = words[0]
		try:	flag = string.atoi(words[1])
		except: return 'ERRH Syntax error in <%s>' % args
		if len(words) > 2:
			reason = words[2]
		self.Lnchr.killProc(bpid, reason, flag)
		return 'OK'

	def procExited(self, p):
		# called by Launcher
		msg = 'EXITED %s %s %s %s' % (p.Bpid, p.ExitCode, p.CPUTime, p.Reason)
		#print 'Sending <%s>' % msg
		if self.Status == 'CONNECTED':
			self.log('D',0,'< (pex) <%s>' % msg)
			self.Str.send(msg)
		self.ExitHist[p.Bpid] = (p.ExitCode, p.Reason, p.CPUTime)
			
class	Launcher:
	# Starts batch processes on behalf of the scheduler interface (SchIf),
	# tracks them by UNIX pid and by batch process id, reaps them when they
	# exit and kills those that exceed their CPU limit.  Also owns the
	# mapping of local disk resource names to scratch directories.

	def __init__(self, cfg, fcfg, id, ip, sel):
		# cfg:	configuration handed to SchInterface/StatIF and used to
		#		look up the kinit command
		# fcfg:	configuration used for node class / local disk lookup
		# id:	id of this node (key into both configurations)
		# ip:	full host name, substituted for %H in the kinit command
		# sel:	selector handed to the interfaces and to process start
		self.FCfg = fcfg
		self.Cfg = cfg
		self.Id = id
		self.FullIP = ip
		self.Upid2Proc = {}		# upid to Process
		self.Bpid2Proc = {} 	# bpid to Process
		self.Status = 'OK'		# or 'HOLD'
		self.HoldReason = ''
		self.SchIf = SchInterface(self, self.Cfg, sel)
		self.StatIF = StatIF(self, self.Cfg, self.Id, sel)
		self.ScratchAreas = {}	# resource name to scratch directory
		self.Sel = sel
		self.NextSA = 0			# round-robin index for getScratchArea()
		self.updateScratchAreas()
		if not self.ScratchAreas:
			# not fatal: the launcher runs without scratch areas
			self.ScratchAreas = {}
			self.log('I',100,'No scratch area mapping is defined')
			print('No scratch area mapping is defined')

	def initScratchAreas(self):
		# (Re)initialize every scratch area currently configured
		for rn, sa in self.ScratchAreas.items():
			self.initScratchArea(sa)

	def initScratchArea(self, root):
		# Clean out directory root with futil.rmdirrec(keep_root=1) and try
		# to create it.  Both steps are best-effort: errors are ignored.
		try:	futil.rmdirrec(root, keep_root=1)
		except: pass
		try:	os.mkdir(root)
		except: pass

	def updateScratchAreas(self):
		# Re-read the farm configuration and replace ScratchAreas with the
		# 'local_disks' of this node's class.  Areas that are new, or whose
		# directory changed, are initialized first.  If the configuration
		# can not be re-read the current mapping is kept.
		# Raises ValueError if no node class is configured for this node.
		try:	self.FCfg.reReadConfig()
		except: return		# use old values

		node_class = self.FCfg.getValueList('node_list','*',self.Id)
		if node_class == None:
			msg = 'Can not determine node class for id=<%s>' % self.Id
			self.log('F',100,msg)
			print(msg)
			print('Exiting')
			raise ValueError(msg)
		node_class = node_class[0]

		areas = self.FCfg.getValueDict('node_class',node_class,
				'local_disks')
		if not areas:
			# NOTE(review): presumably getValueDict() returns None when
			# 'local_disks' is not configured (getValueList() does) -
			# treat that as "no scratch areas" instead of failing on
			# .items() below.  Confirm against the config module.
			areas = {}
		for newrn, newroot in areas.items():
			if not self.ScratchAreas.has_key(newrn) or \
					self.ScratchAreas[newrn] != newroot:
				# new scratch disk
				self.log('I',100,'Initializing new or moved scratch area %s at %s' % (
					newrn, newroot))
				self.initScratchArea(newroot)
		self.ScratchAreas = areas

	def probe(self):
		self.SchIf.probe()

	def getScratchArea(self, rsrc = None):
		# Return the scratch directory for resource name rsrc.  Without
		# rsrc the configured areas are handed out in round-robin order.
		# Returns None if no scratch areas are configured; raises KeyError
		# for an rsrc that is not configured.
		if not self.ScratchAreas:
			return None
		if not rsrc:
			names = list(self.ScratchAreas.keys())
			i = self.NextSA % len(names)
			self.NextSA = (i + 1) % len(names)
			rsrc = names[i]
		return self.ScratchAreas[rsrc]

	def holdIt(self, reason = 'reason unknown'):
		# Put the launcher into 'HOLD' status, recording time and reason
		self.HoldReason = 'Launcher: %s: %s' % (time.ctime(time.time()), reason)
		self.log('E',911,'LCH: HOLD: %s' % reason)
		self.Status = 'HOLD'

	def release(self):
		# Clear the hold status set by holdIt()
		self.log('I',1,'RELEASE')
		self.Status = 'OK'
		self.HoldReason = ''

	def log(self, level, id, msg):
		# Log through the global log client, or to stdout while there is
		# no log client yet
		if LogC != None:
			LogC.log(level, id, 'LCH: %s' % msg)
		else:
			print('[No Log Client] LCH: %s' % msg)

	def idle(self):
		# Give the scheduler interface its idle time, then reap all child
		# processes that have exited and report the known ones to SchIf.
		self.SchIf.idle()

		# check for exited processes
		while 1:
			try:	pid, sts = os.waitpid(-1, os.WNOHANG)
			except OSError:
				break		# waitpid() failed, e.g. no children at all
			if pid <= 0:	break
			p = self.Upid2Proc.get(pid)
			if p is None:
				continue	# a child we do not keep track of
			self.log('I', 100, 'EXIT UPID=%d BPID=%s code=%s cpu=%s reason=%s' %
					(pid, p.Bpid, sts, p.CPUTime, p.Reason))
			p.exited(sts)
			self.SchIf.procExited(p)
			try:	del self.Bpid2Proc[p.Bpid]
			except KeyError: pass
			try:	del self.Upid2Proc[pid]
			except KeyError: pass

	def makeKinitCommand(self, username):
		# Build the kinit command (list of words) for username from the
		# 'k5_kinit_command' template of this launcher, substituting
		# %u (user name), %h (node id) and %H (full host name).
		# Returns None if no template is configured.
		try:	self.Cfg.reReadConfig()
		except: pass	# if file can not be opened, use old values
		template = self.Cfg.getValueList('launcher',self.Id,'k5_kinit_command')
		if not template:
			return None
		lst = []
		for w in template:
			w = w.replace('%u', username)
			w = w.replace('%h', self.Id)
			w = w.replace('%H', self.FullIP)
			lst.append(w)
		return lst

	def startProc(self, bpid, size, username, kinit_required, out, err, nice,
			cpulim, reallim, nlst, cmd, local_rsrc, sect_pool_dict,
			proc_pool_dict):
		# called by SchIf
		#
		# Create and start a BatchProcess for bpid.  Returns a tuple
		# (pid, error type, reason): (pid, '', 'OK') on success, otherwise
		# (-1, <error type>, <reason>).  If the process reports error type
		# 'ERRH' the launcher is put on hold.

		# Map each requested local disk to its scratch directory.  A
		# requested name may be redirected to another disk name through
		# proc_pool_dict; disks without a scratch area are left out.
		wddict = {}
		self.updateScratchAreas()
		for requested in local_rsrc.keys():
			actual = requested
			if proc_pool_dict.has_key(requested):
				actual = proc_pool_dict[requested]
			if self.ScratchAreas.has_key(actual):
				wddict[requested] = self.ScratchAreas[actual]

		p = BatchProcess(bpid, size, username,
			cpulim, reallim)

		kinit_command = None
		if kinit_required != 'no':
			kinit_command = self.makeKinitCommand(username)

		if kinit_required == 'yes' and not kinit_command:
			msg = 'kinit required for user <%s>, but kinit command not defined' % username
			self.log('E', 1, 'hold: %s' % msg)
			return -1, 'ERRH', msg

		# Use the selector this launcher was created with rather than a
		# module-level name that exists only when run as a script.
		pid = p.start(self.Sel, cmd, out, err, nice, nlst, self.getScratchArea(),
			wddict, sect_pool_dict, proc_pool_dict,
			kinit_required, kinit_command)

		self.log('D',0,'Proc.start(): %d %s' % (pid, p.Reason))
		if pid > 0:
			self.Upid2Proc[pid] = p
			self.Bpid2Proc[bpid] = p
			return pid, '', 'OK'
		if p.Errt == 'ERRH':
			self.holdIt(p.Reason)
		return -1, p.Errt, p.Reason

	def getProcList(self):
		# Returns the live bpid -> process dictionary (not a copy)
		return self.Bpid2Proc

	def killProc(self, bpid, reason, flag):
		# Initiate the kill of process bpid.
		# Returns 1 if the process is known, 0 otherwise.
		p = self.Bpid2Proc.get(bpid)
		if p is None:
			return 0
		p.initKill(reason, flag)
		return 1

	def getStat(self, bpid):
		# Returns the status string of process bpid followed by '.',
		# or 'NF' if the process is not known
		try:	proc = self.Bpid2Proc[bpid]
		except KeyError: return 'NF'
		if SysStat != None: SysStat.update()
		return proc.getStat() + '.'

	def checkTimes(self):
		# Update the time accounting of all processes, start killing those
		# over their CPU limit and continue kills that are in progress.
		# Does nothing if SysStat is not available.
		if SysStat:
			SysStat.update()
			for p in self.Bpid2Proc.values():
				p.recordTime()
				if p.CPULimit > 0 and p.CPUTime > p.CPULimit:
					p.initKill('CPU time limit exceeded',0)
				if p.Status != None:
					p.finishKill()
										
class	StatIF:
	# UDP service answering "STAT <bpid>" requests with the status string
	# obtained from the launcher
	def __init__(self, lch, cfg, myid, sel):
		# Bind a datagram socket to the configured 'stat_port' (default
		# 6789) and register it with the selector for reading
		self.Lch = lch
		self.Cfg = cfg
		self.MyId = myid
		self.Port = self.Cfg.getValue("launcher", myid, "stat_port", 6789)
		self.Sock = socket(AF_INET, SOCK_DGRAM)
		self.Sock.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
		self.Sock.bind(('',self.Port))
		self.Selector = sel
		sel.register(self, rd = self.Sock.fileno())

	def log(self, level, id, msg):
		# Log through the global log client if there is one, else to stdout
		if LogC != None:
			LogC.log(level, id, 'STAT: %s' % msg)
			return
		print('[No Log Client] STAT: %s' % msg)

	def doRead(self, fd, sel):
		# Selector callback: read one request from the socket and, if it is
		# a well-formed "STAT <procid>", send the answer back to the sender.
		# Anything else is dropped without a reply.
		if fd != self.Sock.fileno():	return

		try:	request, sender = self.Sock.recvfrom(1000)
		except:
			etype, evalue = sys.exc_info()[:2]
			self.log('E', 200, 'Error in doStatIO.recvfrom: %s %s' %
				(etype, evalue))
			return
		self.log('D', 200, 'Stat request <%s> from %s' % (request,sender))

		# Message: STAT <procid>
		fields = request.split()
		if len(fields) == 2 and fields[0] == 'STAT':
			reply = self.Lch.getStat(fields[1])
			# the reply is best-effort: send errors are ignored
			try:	self.Sock.sendto(reply, sender)
			except: pass

"""
class	ControlIF:
	def __init__(self, lch, cfg, myid, sel, logc):
		self.LogC = logc
		self.Lch = lch
		self.Cfg = cfg
		self.MyId = myid
		self.Port = self.Cfg.getValue("launcher", myid, "control_port", 6798)
		self.Sock = socket(AF_INET, SOCK_DGRAM)
		self.Sock.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
		self.Sock.bind(('',self.Port))
		self.Selector = sel
		self.Selector.register(self, rd = self.Sock.fileno())

	def log(self, level, id, msg):
		self.LogC.log(level, id, 'CTL: %s' % msg)

	def doRead(self, fd, sel):
		if fd != self.Sock.fileno():	return

		try:	msg, addr = self.Sock.recvfrom(1000)
		except:
			self.log('E', 200, 'Error in doRead: %s %s' % 
				(sys.exc_type, sys.exc_value))
			return

		words = string.split(msg)
		if words[0] == 'RESTART':
			lch.restart()
		elif words[0] == 'SHUT':
			lch.shutDown()
"""
				
def lock(fn):
	# Try to become the launcher recorded in pid file fn.
	#
	# The file holds "<pid> [<start time>]" of the launcher that owns it.
	# The record is considered stale, and is overwritten with our own pid
	# (plus start time if SysStat is available), when
	#	- it is missing or can not be parsed,
	#	- it names this process or a pid <= 0,
	#	- the process can not be signalled, or
	#	- SysStat does not know the process, or reports a start time more
	#	  than 10 off the recorded one (units as reported by SysStat).
	# The file is flock()-ed while it is examined and rewritten.
	#
	# Returns 0 if this process now owns the file, otherwise the pid of
	# the launcher that does.
	mypid = os.getpid()
	fd = os.open(fn,os.O_RDWR+os.O_CREAT)
	try:
		fcntl.flock(fd,fcntl.LOCK_EX)
		text = os.read(fd,1000).strip()
		pid = 0
		other_start = None
		if SysStat:	SysStat.update()
		got_it = 1
		try:
			words = text.split()
			pid = int(words[0])
			if len(words) > 1:
				other_start = int(words[1])
			got_it = 0
		except (IndexError, ValueError):
			# empty or garbled record
			pass

		# A pid <= 0 can not be a launcher, and os.kill() would treat it as
		# a process group and succeed, leaving the file without an owner.
		got_it = got_it or pid == mypid or pid <= 0

		if not got_it:
			try:	os.kill(pid, 0)		# signal 0: existence check only
			except (OSError, OverflowError):
				got_it = 1

		if not got_it and SysStat and other_start:
			# The pid exists, but it may have been re-used by an unrelated
			# process: compare start times.
			try:	pi = SysStat[pid]
			except:
				got_it = 1
			else:
				got_it = abs(pi.stime - other_start) > 10

		if got_it:
			pid = 0
			if SysStat:
				text = '%d %d\n' % (mypid, SysStat[mypid].stime)
			else:
				text = '%d\n' % mypid
			os.lseek(fd,0,0)
			os.write(fd,text)
			os.ftruncate(fd,len(text))
		fcntl.flock(fd,fcntl.LOCK_UN)
	finally:
		# Runs on errors too, so the descriptor never leaks; closing it
		# also gives up a flock that is still held.
		os.close(fd)

	return pid

if __name__ == '__main__':
	# Launcher daemon entry point:
	#	launcher [-c <cfg>] [-f <farm-cfg>] [-n <node-name>] [-d]
	# Without -c / -f the configuration files are taken from the
	# FBS_CONFIG / FBS_FARM_CONFIG environment variables.
	from config import *
	from Selector import *
	import getopt

	sel = Selector()

	try:	opts, args = getopt.getopt(sys.argv[1:], 'dn:c:f:')
	except getopt.error:
		print(sys.exc_info()[1])
		print('Usage: launcher [-c <cfg>] [-f <farm-cfg>] [-n <node-name>] [-d]')
		sys.exit(2)

	cfg = None
	fcfg = None
	myhost = ''
	for opt, val in opts:
		if opt == '-c':
			cfg = ConfigFile(val)
		elif opt == '-f':
			fcfg = ConfigFile(val)
		elif opt == '-n':
			myhost = val
		elif opt == '-d':
			# getopt reports options with their leading dash
			Debug = 1

	if cfg == None:
		try:
			cfg_path = os.environ['FBS_CONFIG']
		except KeyError:
			print('Daemon configuration file undefined')
			sys.exit(1)
		cfg = ConfigFile(cfg_path)

	if fcfg == None:
		try:
			fcfg_path = os.environ['FBS_FARM_CONFIG']
		except KeyError:
			print('Farm configuration file undefined')
			sys.exit(1)
		fcfg = ConfigFile(fcfg_path)

	if not myhost:
		myhost = gethostname()
	domain = cfg.getValue('global','*','domain')
	if domain != None:
		myhost, dmn = fbs_misc.stripDomain(myhost, domain)

	# get full IP address
	ip = gethostname()
	ip = gethostbyaddr(gethostbyname(ip))[0]

	# check UID
	if os.getuid() != 0:
		print('Launcher must run as root')
		sys.exit(1)

	# SysStat may be unavailable (None); refresh it only if it exists and
	# judge its availability after the refresh
	if SysStat != None:
		SysStat.update()
	if SysStat:
		systs = 'OK'
	else:
		systs = 'unavailable'

	# make sure this is the only launcher on the node
	other_pid = lock('%s/launcher.pid' % TmpDir)
	if other_pid:
		print('Launcher is already running, pid = %s' % other_pid)
		sys.exit(1)

	l = Launcher(cfg, fcfg, myhost, ip, sel)

	print('Launcher object created')

	# the log client is opened only after the launcher object exists;
	# until then log() methods print to stdout
	log_ignore = cfg.getValue('launcher',myhost,'log_ignore','')
	LogC = LogClient(cfg, 'lch', '%s.%s' % (myhost, os.getpid()),
				log_ignore)

	msg = 'Launcher version: %s' % Version
	print(msg)
	LogC.log('X',0,msg)

	msg = 'Launcher started with host name [%s], SysStat %s' %\
		(myhost, systs)
	print(msg)
	LogC.log('X',0,msg)

	timer = Timer()
	timer.add(l.checkTimes, 0, 10)
	timer.add(l.probe, 0, 180)

	# main loop: wait for I/O (5 second timeout), then do housekeeping
	while 1:
		sel.select(5)
		l.idle()
		LogC.idle()
		timer.run()
