#
# @(#) $Id: lchif.py,v 1.76 2003/09/22 19:30:19 ivm Exp $
#
# $Author: ivm $
#
# $Log: lchif.py,v $
# Revision 1.76  2003/09/22 19:30:19  ivm
# Fixed: nodeUp will send only relevant BPIDs to each section
#
# Revision 1.75  2003/08/22 17:25:09  ivm
# Tested min. nice for proctype
#
# Revision 1.74  2003/08/20 18:58:57  ivm
# Implemented CPU power, round-robin-over-users scheduling inside queuei,
# other minor things.
#
# Revision 1.73  2001/10/27 18:00:15  ivm
# Implemented non-blocking process start
# Fixed some bugs
#
# Revision 1.72  2001/04/26 17:22:43  ivm
# Fixed some bugs
# Added doc/ReleaseNotes_v1.3.txt
#
# Revision 1.71  2001/03/15 21:47:54  ivm
# Implemented "on nodes"
# Fixed protocol version handling in lch, lchif
#
# Revision 1.70  2001/02/26 22:28:34  ivm
# Fixed kill.py
# Added RELEASE message sent from LchIF to launcher to clear held status
#
# Revision 1.69  2001/02/08 19:15:51  ivm
# Added protocol version verification
#
# Revision 1.68  2001/02/05 18:34:34  ivm
# Fixed bug with sending krbopt
#
# Revision 1.66  2001/02/05 15:40:58  ivm
# *** empty log message ***
#
# Revision 1.65  2001/01/24 21:19:13  ivm
# Overwrite K5 c.cache file on process exit in Launcher
# Call nodeUp after the node interface is created in LchIF
#
# Revision 1.64  2001/01/22 16:52:39  ivm
# Notify all sections mentioned by launcher on node-up
#
# Revision 1.63  2001/01/09 17:03:14  ivm
# Fixed numerous bugs in farm_history
#
# Revision 1.62  2000/12/05 21:42:11  ivm
# Attempt to re-connect in FBSClient
# Fixed username -> uid mapping in status.py
# v1_2a_1
#
# Revision 1.59  2000/11/08 20:27:01  ivm
# Implemented FBSClient.getServerOptions()
# Fixed bugs
#
# Revision 1.58  2000/10/24 16:05:14  ivm
# Send Exec to BMGR as serialized list, not string
# Call killJob, not killSection, killSection, ... in kill.py
#
# Revision 1.57  2000/10/11 19:36:59  ivm
# Added FBSNodeInfo.holdNode()
# Implemented credentials creation by Launcher
#
# Revision 1.56  2000/10/06 17:24:14  ivm
# Removed debug print-outs
#
# Revision 1.55  2000/10/06 00:27:44  ivm
# Fixed bug in Lchif.nodeUp() with unsubscribe
# Use PID in bmgr's id for logger
#
# Revision 1.54  2000/09/27 20:18:37  ivm
# Renamed Scheduler.canRun() to trigger()
#
# Revision 1.53  2000/09/20 14:02:44  ivm
# Fixed minor bugs
#
# Revision 1.52  2000/09/12 15:00:00  ivm
# Implemented createLocalResource method and all related changes
#
# Revision 1.51  2000/08/30 19:52:41  ivm
# Disconnect node on removal
# Fixed Usage printing in config
#
# Revision 1.50  2000/08/24 16:33:56  ivm
# Fixed some bugs
#
# Revision 1.49  2000/08/03 20:54:38  ivm
# Fixed more bugs, implemented SCRATCH_<requested-resource> variables.
#
# Revision 1.47  2000/08/03 18:50:25  ivm
# Use two dictionaries for RP to UR in startProcess
#
# Revision 1.46  2000/07/26 14:38:31  ivm
# Implemented RPs and multiple local disks
#
# Revision 1.45  2000/07/25 18:30:07  ivm
# Pass section and process pool dictionaries to process environment
#
# Revision 1.44  2000/06/28 21:31:13  ivm
# Cache BMGR's IP address to avoid DNS communication in FBSClient
# Re-arranged debug messages in LCHIF
#
# Revision 1.42  2000/06/21 14:43:27  ivm
# Added killJob()
# Optimized EXITED/DEL part of launcher protocol
#
# Revision 1.40  2000/06/19 21:03:56  ivm
# Watch for unknown parent processes in Systat
# Catch Unknown Host exception in LchIF
#
# Revision 1.39  2000/06/19 19:57:12  ivm
# Added sect_output to ui/Makefile
# Fixed LCHIF
#
# Revision 1.38  2000/06/19 15:57:06  ivm
# Launcher remembers self-hold and sends it to bmgr on connect
#
# Revision 1.36  2000/06/19 14:39:45  ivm
# LchIF holds node.
# Changes for improved section saving.
#
# Revision 1.35  2000/06/08 18:32:44  ivm
# Order sections by JDFSeq in FBSJobInfo.sections()
#
# Revision 1.34  2000/05/31 15:13:09  ivm
# Removed debug messages from NetIF, fixed Queue.
# Implemented probing in LaucherIF
#
# Revision 1.33  2000/05/08 16:32:31  ivm
# Fixed failure return from Lch.startProcess()
#
# Revision 1.32  2000/04/26 15:37:20  ivm
# Send UPID back to BMGR
#
# Revision 1.31  2000/04/19 14:27:32  ivm
# Fixed bug with EXITED
#
# Revision 1.30  2000/04/17 15:43:43  ivm
# Do not query Launcher about non-running processes
#
# Revision 1.29  2000/04/10 20:46:16  ivm
# Fixed bug with making node down
#
# Revision 1.25  2000/04/03 22:13:00  ivm
# Implemented hard kill
#
# Revision 1.22  2000/03/30 20:53:11  ivm
# Added Scheduler.canRun() to NetIF and LchIF
#
# Revision 1.21  2000/03/30 20:44:47  ivm
# Removed and added some printouts
#
# Revision 1.20  2000/03/14 16:38:10  ivm
# Fixed start()
#
# Revision 1.19  2000/03/07 17:28:13  ivm
# Fixed bug with generating infinite EOMs
#
# Revision 1.18  2000/02/29 22:38:25  ivm
# Added LaucherIF.getSubList()
#
# Revision 1.17  2000/02/22 14:47:51  ivm
# Reject new connection if the node is already connected
#
# Revision 1.16  2000/02/22 14:42:32  ivm
# Fixed some bugs
#
# Revision 1.15  2000/02/21 18:37:07  ivm
# Fixed bugs in KILL
#
# Revision 1.13  2000/02/21 17:53:57  ivm
# Added killing, primitive version, no tree killing or
# grace period
#
# Revision 1.11  2000/02/15 16:51:15  ivm
# *** empty log message ***
#
# Revision 1.10  2000/02/10 21:45:09  tlevshin
# commented call for section.nodeUp for now
#
# Revision 1.9  2000/02/09 23:38:06  tlevshin
# change unsubHostNotify to unsubNodeNotify
#
# Revision 1.8  2000/02/09 21:45:48  tlevshin
# fixed decodeDotId (should be ID)
#
# Revision 1.7  2000/02/09 21:41:41  tlevshin
# *** empty log message ***
#
# Revision 1.6  2000/02/09 21:03:45  tlevshin
# added two new functions unpackHello,procExit
# fixed some minor bugs
#
# Revision 1.5  2000/02/08 09:41:11  ivm
# Fixed bug in startProcess()
#
# Revision 1.4  2000/02/03 20:03:07  ivm
# *** empty log message ***
#
# Revision 1.2  2000/01/26 19:52:23  ivm
# Added SecParam fields to START command
#
# Revision 1.1  1999/12/24 17:11:23  ivm
# Added lchif.py
#
#

from TCPServer import *
from Selector import *
from SockStream import *
import string
import Parser
import bmgr_global
import fbs_misc
import RM
import serialize

# Launcher protocol versions accepted during the HELLO/HOLD handshake;
# LauncherIF.createClientInterface() refuses any other version string.
AcceptedProtocols = ['1.3','1.4']

class	_HostIF:
	def __init__(self, nname, protov, lchif, str, sel, sock, addr):
		"""Set up the per-node connection state and start watching the socket.

		nname  - node name, kept as self.Node
		protov - protocol version string announced by the launcher
		lchif  - owning interface object; receives procExit()/nodeDown() calls
		str    - message stream wrapped around sock (parameter name kept for
		         compatibility although it shadows the builtin)
		sel    - selector this object registers itself with for read events
		sock   - connected socket
		addr   - peer address, stored as self.Addr
		"""
		# who is on the other end
		self.Node = nname
		self.ProtocolVersion = protov
		self.Addr = addr
		# how we talk to it
		self.Sock = sock
		self.Str = str
		self.Sel = sel
		# who we report to
		self.LchIF = lchif
		# start replies received but not yet claimed: procid -> (sts, upid, reason)
		self.ProcessStartedCache = {}
		# time stamps handed to recordFailure()
		self.FailureTimes = []
		sel.register(self, rd=sock.fileno())

	def log(self, level, msg):
		"""Pass msg to the global log client, prefixed with this node's name."""
		text = 'LchIF/HostIF(%s): %s' % (self.Node, msg)
		bmgr_global.G_LogClient.log(level, 1, text)

	def recordFailure(self, t):
		"""Append time stamp t to this node's failure history."""
		history = self.FailureTimes
		history.append(t)

	def failuresSince(self, tbegin):
		"""Return how many recorded failures have a time stamp >= tbegin."""
		# A list comprehension rather than len(filter(...)): filter() yields a
		# lazy iterator without len() on Python 3, and no lambda is needed.
		return len([t for t in self.FailureTimes if t >= tbegin])

	def clearFailures(self, tbefore):
		"""Forget every recorded failure with a time stamp earlier than tbefore."""
		# Rebuild as a real list: assigning the result of filter() would leave
		# a one-shot iterator here on Python 3 and break the next append().
		self.FailureTimes = [t for t in self.FailureTimes if t >= tbefore]

	def doRead(self, fd, sel):
		"""Selector read callback: handle messages that arrived from the launcher.

		Pulls pending data into the stream and processes every complete
		message:
		  EXITED <pid> ... - forwarded to LchIF.procExit(), then acknowledged
		                     with 'DEL <pid>'
		  UP ...           - acknowledged with 'UDACK <original message>'
		Anything else is logged and ignored.  When the stream reports
		end-of-file the connection is torn down with disconnect().

		fd  - descriptor reported readable; ignored unless it is our socket
		sel - the selector (not used here)
		"""
		if self.Sock is None or fd != self.Sock.fileno():
			return
		self.Str.readMore(1000)
		# self.Str is re-checked on every pass: the procExit() callback below
		# may end up in disconnect(), which sets self.Str to None.
		while self.Str is not None and self.Str.msgReady():
			msg = self.Str.getMsg()
			self.log('D', 'Rcvd: <%s>' % msg)
			words = Parser.parseWords(msg)
			if not words:
				continue
			if words[0] == 'EXITED':
				if len(words) < 2:
					# no process id to report or acknowledge
					self.log('E', 'Malformed EXITED message ignored: <%s>' % msg)
					continue
				pid = words[1]
				self.LchIF.procExit(self.Node, msg)
				if self.Str is None:
					# connection was closed while the exit was being handled
					break
				reply = 'DEL %s' % pid
				self.Str.send(reply)
				self.log('D','Sent: <%s>' % reply)
			elif words[0] == 'UP':
				# send ACK and ignore
				reply = 'UDACK %s' % msg
				self.Str.send(reply)
				self.log('D','Sent: <%s>' % reply)
		if self.Str is not None and self.Str.eof():
			self.disconnect()

	def disconnect(self):
		"""Close the launcher connection and report the node as down.

		Does nothing when the socket is already gone, so repeated calls are
		harmless.  Otherwise: unregister from the selector, close the socket,
		drop the stream and socket references, then call LchIF.nodeDown().
		"""
		sock = self.Sock
		if sock is None:
			return
		self.log('I','disconnecting')
		self.Sel.unregister(rd=sock.fileno())
		sock.close()
		self.Str = self.Sock = None
		self.LchIF.nodeDown(self.Node)

	def release(self):
		"""Send RELEASE to the launcher.

		Returns (1, 'OK') after sending, or (0, 'Node not connected') when
		the stream has already been dropped.
		"""
		stream = self.Str
		if stream is not None:
			stream.send('RELEASE')
			return 1, 'OK'
		return 0, 'Node not connected'

	def startProcess(self, jobid, sname, procno, params, nodes,
				local_rsrc, sect_pool_dict, proc_pool_dict,
				do_not_wait = 0):
		"""Send a START command for one process to the launcher.

		The command has the form
		  START <procid> <nproc> <user> <krbopt> <stdout> <stderr> <nice>
		        <cpulim> <reallim> <nodes> <local-rsrc> <sect-pools>
		        <proc-pools> <serialized exec>
		where both limits are divided by the fourth element returned by
		G_ResourceManager.getNodeClass() for this node's class (presumably
		the CPU power factor - confirm against the resource manager).

		jobid, sname, procno - identify the process (see fbs_misc.encodeProcID)
		params               - object providing NProc, Username, Stdout, Stderr,
		                       Nice, CPUTimeLimit, RealTimeLimit and Exec
		nodes                - sequence of node names, sent comma-separated
		local_rsrc, sect_pool_dict, proc_pool_dict
		                     - dictionaries sent as ',key:value,key:value,'
		do_not_wait          - if true, return right after sending; requires
		                       protocol version 1.4 or later

		Returns (pid, status, reason):
		  (-1, 'ERRH', reason) - node disconnected or non-blocking start
		                         not supported
		  (-1, 'ERR', reason)  - empty node list
		  (0, 'OK', 'OK')      - command sent, do_not_wait was set
		  otherwise whatever waitProcessStart() returns.
		"""
		if self.Str is None:
			return -1, 'ERRH', 'Node disconnected'
		if do_not_wait and self.ProtocolVersion < '1.4':
			return -1, 'ERRH', 'Non-blocking start is not supported by the Launcher'
		if not nodes:
			# An empty list used to raise IndexError while the node field
			# was built.  Refuse the request instead: an empty field would
			# shift every following word of the space-separated command.
			return -1, 'ERR', 'Empty node list'
		procid = fbs_misc.encodeProcID(jobid, sname, procno)

		nlist = ','.join([n.strip() for n in nodes])

		# Dictionaries travel as ',k:v,k:v,'; the leading comma keeps the
		# field non-empty even for an empty dictionary.
		ppstr = ',' + ''.join(['%s:%s,' % kv for kv in proc_pool_dict.items()])
		spstr = ',' + ''.join(['%s:%s,' % kv for kv in sect_pool_dict.items()])
		lrstr = ',' + ''.join(['%s:%s,' % kv for kv in local_rsrc.items()])

		krbopt = bmgr_global.G_ServerCfg.getValue('user_profile',params.Username,
				'create_k5_credentials','optional')

		cn = bmgr_global.G_ResourceManager.getClassOfNode(self.Node)
		junk, junk, junk, power = bmgr_global.G_ResourceManager.getNodeClass(cn)

		parstr = '%s %d %s %s %s %s %d %d %d %s %s %s %s %s' % (
			procid, params.NProc, params.Username, krbopt,
			params.Stdout, params.Stderr, params.Nice,
			params.CPUTimeLimit/power, params.RealTimeLimit/power, nlist,
			lrstr, spstr, ppstr, serialize.serialize(params.Exec))
		cmd = 'START %s' % parstr
		self.Str.send(cmd)
		self.log('D','Sent: <%s>' % cmd)
		if do_not_wait:
			return 0, 'OK', 'OK'
		return self.waitProcessStart(procid)

	def waitProcessStart(self, bpid, tmo = 180):
		"""Wait for the launcher's reply to a START command for process bpid.

		Replies look like
		  <sts> <reason>            for protocol versions before 1.4
		  <sts> <procid> <reason>   for protocol 1.4 and later
		with <sts> one of OK, ERR, ERRH; for OK the reason field carries the
		UNIX pid.  Replies that belong to other processes (possible with
		1.4) are kept in ProcessStartedCache until somebody asks for them.
		Messages that are not start replies are dropped here.

		bpid - process id as produced by fbs_misc.encodeProcID()
		tmo  - time-out handed to each recv(); also the limit checked before
		       every receive after the first one

		Returns (upid, 'OK', '') on success and (-1, sts, reason) otherwise,
		with sts 'ERRH' when the launcher does not answer, sends an unusable
		reply, or the connection is gone.

		NOTE(review): the `time` module is not imported explicitly in this
		file - presumably one of the star imports provides it.
		"""
		if self.Str is None:
			return -1, 'ERRH', 'Node disconnected'

		t0 = time.time()
		first_time = 1
		while self.ProcessStartedCache.get(bpid) is None and \
						not self.Str.eof():

			if not first_time and time.time() > t0 + tmo:
				return -1, 'ERRH', 'Launcher is not responding'
			first_time = 0
			try:
				ans = self.Str.recv(tmo = tmo)
			except:
				# timed-out; bare except kept because the exception raised
				# by recv() is not visible from this file
				return -1, 'ERRH', 'Launcher is not responding'

			if not ans:
				continue
			with_procid = self.ProtocolVersion >= '1.4'
			if with_procid:
				words = ans.split(None, 2)
			else:
				words = ans.split(None, 1)
			if not words or not words[0] in ['OK','ERR','ERRH']:
				continue
			sts = words[0]
			if with_procid:
				if len(words) < 2:
					# cannot tell which process this reply belongs to
					self.log('E', 'Start reply without process id ignored: <%s>' % ans)
					continue
				procid = words[1]
				reason = ''.join(words[2:])	# '' when the reason is missing
			else:
				procid = bpid
				reason = ''.join(words[1:])

			upid = ''
			if sts == 'OK':
				try:
					upid = int(reason)
				except ValueError:
					# OK without a usable pid: treat as a launcher failure
					# instead of letting ValueError escape
					self.log('E', 'Malformed start reply: <%s>' % ans)
					sts = 'ERRH'
					reason = 'Malformed reply from Launcher: <%s>' % ans
			self.ProcessStartedCache[procid] = (sts, upid, reason)

		try:
			sts, upid, reason = self.ProcessStartedCache[bpid]
		except KeyError:
			# it must be EOF
			self.disconnect()
			return -1, 'ERRH', 'Node disconnected'

		del self.ProcessStartedCache[bpid]
		if sts == 'OK':
			return upid, 'OK', ''
		return -1, sts, reason

	def killProcess(self, jobid, sname, procno, flag):
		"""Send 'KILL <procid> <flag>' to the launcher and wait for the answer.

		jobid, sname, procno - identify the process (see fbs_misc.encodeProcID)
		flag                 - passed through to the launcher unchanged

		Returns (1, 'OK', '') when the launcher answers OK, otherwise
		(0, 'ERRH', reason): the node is disconnected, the launcher did not
		answer within 180 seconds, or it answered ERR/ERRH.

		NOTE(review): an ERR answer is reported as 'ERRH' as well - confirm
		that this is intended.
		NOTE(review): any OK/ERR/ERRH line received here is taken as the
		answer to this KILL; a pending reply to a non-blocking START would
		presumably be consumed too - verify against the launcher protocol.
		"""
		if self.Str is None:
			return 0, 'ERRH', 'Node disconnected'

		procid = fbs_misc.encodeProcID(jobid, sname, procno)
		# log exactly what goes on the wire, including the flag
		cmd = 'KILL %s %s' % (procid, flag)
		self.Str.send(cmd)
		self.log('D','Sent: <%s>' % cmd)
		while not self.Str.eof():
			try:
				ans = self.Str.recv(tmo=180)
			except:
				# bare except kept: the time-out exception raised by recv()
				# is not visible from this file
				return 0, 'ERRH', 'Launcher is not responding'
			self.log('D', 'Rcvd: <%s>' % ans)
			if not ans:
				continue
			words = Parser.parseWords(ans)
			# "not words" guards against a line that parses to nothing
			if not words or not words[0] in ['OK','ERR','ERRH']:
				continue
			if words[0] == 'OK':
				return 1, 'OK', ''
			return 0, 'ERRH', ' '.join(words[1:])
		self.disconnect()
		return 0, 'ERRH', 'Node disconnected'
		
	def probe(self):
		"""Ask the stream to probe the connection; no-op once disconnected."""
		stream = self.Str
		if stream is None:
			return
		stream.probe()
			
class LauncherIF(TCPServer):
	def __init__(self, cfg, sel):
		"""Create the launcher-facing server, initially disabled.

		cfg - configuration object; read through getValue(section, name,
		      field[, default])
		sel - selector handed to TCPServer and to every node connection
		"""
		listen_port = cfg.getValue('bmgr','*','launcher_if_port',5555)
		# enabled = 0: nothing is accepted until enable() is called
		TCPServer.__init__(self, listen_port, sel, enabled = 0)
		self.JobFinder = bmgr_global.G_JobFinder
		self.ResourceMgr = bmgr_global.G_ResourceManager
		self.HostDict = {}	# dict[nname] = _HostIF of the connected node
		self.SectList = {}	# dict[nname] = [sectId, sectId, ...] subscribers
		self.ProcList = {}	# dict[nname] = [pid, pid, ...]
		self.Domain = cfg.getValue('global', '*', 'domain')
		# both are read by procExit() to decide when to hold a node
		self.FailureCountThreshold = cfg.getValue('bmgr', '*', 'max_node_failures')
		self.FailureHistoryInterval = cfg.getValue('bmgr', '*', 'failures_count_interval')
						
	def log(self, level, msg):
		"""Pass msg to the global log client with the 'LchIF: ' prefix."""
		text = 'LchIF: %s' % (msg,)
		bmgr_global.G_LogClient.log(level, 1, text)

	def getSubList(self, nname):
		"""Return the section ids subscribed to node nname.

		The stored list itself is returned (not a copy); a new empty list
		is returned for a node without subscriptions.
		"""
		return self.SectList.get(nname, [])
		
	def subNodeNotify(self, sectId, nname):
		"""Subscribe section sectId to up/down notifications for node nname.

		Subscribing twice has no additional effect.
		"""
		subscribers = self.SectList.setdefault(nname, [])
		if sectId not in subscribers:
			subscribers.append(sectId)
		
	def unsubNodeNotify(self, sectId, nname = '*'):
		"""Cancel the node notifications of section sectId.

		nname - node to unsubscribe from; '*' (the default) means every node.
		Unknown nodes and sections that are not subscribed are ignored.
		"""
		if nname == '*':
			targets = self.SectList.values()
		else:
			targets = [self.SectList.get(nname, [])]
		for subscribers in targets:
			try:
				subscribers.remove(sectId)
			except ValueError:
				# not subscribed to this node - nothing to undo
				pass

	def nodeUp(self, nname, procDict):
		"""Handle a launcher that has just connected from node nname.

		procDict maps the process ids (bpid) the launcher reports as its own
		to their UNIX pids.  Steps:
		  1. rebuild ProcList[nname] from the decodable bpids;
		  2. call nodeUp(nname, {bpid: upid}) on every section that is
		     subscribed to the node or owns a reported process, passing only
		     that section's processes; ids of sections that no longer exist
		     are unsubscribed;
		  3. kill reported processes whose section no longer exists;
		  4. drop the node's failure history recorded before now.
		"""
		lst = self.SectList.get(nname, [])
		slst = lst[:]		# make a copy, unsubNode... may remove some elements
		self.log('I','Node <%s> UP. Sections: %s. Processes: %s.' % (nname,
				','.join(lst), procDict))

		# Decode every reported bpid once and group the processes by section.
		self.ProcList[nname] = []
		reported = []		# (bpid, sid, procno) for every decodable bpid
		per_section = {}	# sid -> {bpid: upid}
		for bpid, upid in procDict.items():
			try:
				jid, sname, procno = fbs_misc.decodeDotID(bpid)
			except:
				# garbage; bare except kept because the exception raised
				# by decodeDotID() is not visible from this file
				continue
			sid = fbs_misc.encodeSectionID(jid, sname)
			reported.append((bpid, sid, procno))
			self.ProcList[nname].append(bpid)
			per_section.setdefault(sid, {})[bpid] = upid
			if not sid in slst:
				slst.append(sid)

		# notify subscribed + reported by launcher sections
		for sid in slst:
			s = self.JobFinder.getSection(sid)
			if s is None:
				self.unsubNodeNotify(sid)
				continue
			s.nodeUp(nname, per_section.get(sid, {}))

		# Find and kill all orphan processes
		for bpid, sid, procno in reported:
			if bmgr_global.G_JobFinder.getSection(sid) is None:
				self.log('I','Killing orphan process <%s>' % bpid)
				self.killProcess(nname, sid, procno)

		# The callbacks and kills above may have disconnected the node, in
		# which case nodeDown() has already removed it from HostDict.
		hif = self.HostDict.get(nname)
		if hif is not None:
			hif.clearFailures(time.time())

	def nodeDown(self, nname):
		"""Handle the loss of the launcher connection for node nname.

		Clears the node's process list, calls nodeDown(nname) on every
		subscribed section (ids of sections that no longer exist are
		unsubscribed), drops the node's subscriptions, marks the node
		'down' in the resource manager and removes its connection object
		from HostDict.
		"""
		# Copy the process list BEFORE clearing it, so that the log line
		# shows what was running rather than an always-empty list.
		proclist = self.ProcList.get(nname, [])[:]
		self.ProcList[nname] = []
		lst = self.SectList.get(nname, [])[:]
		self.log('I','Node <%s> DOWN. Sections: %s. Processes: %s.' % (nname,
				','.join(lst), ','.join(proclist)))
		for sid in lst:
			s = self.JobFinder.getSection(sid)
			if s is None:
				self.unsubNodeNotify(sid)
				continue
			s.nodeDown(nname)
		try:
			del self.SectList[nname]
		except KeyError:
			pass		# the node had no subscriptions
		self.ResourceMgr.setNodeStatus(nname, 'down')
		try:
			del self.HostDict[nname]
		except KeyError:
			pass		# already removed

	def holdNode(self, nname, reason = 'reason unknown'):
		"""Put node nname on hold in the resource manager and log it.

		A reason that does not start with 'Launcher:' is prefixed with
		'LauncherIF: <current time>: '.  Errors raised by the resource
		manager are ignored; the hold is logged either way.
		"""
		text = reason
		if not reason.startswith('Launcher:'):
			stamp = time.ctime(time.time())
			text = 'LauncherIF: %s: %s' % (stamp, reason)
		try:
			self.ResourceMgr.holdNode(nname, text)
		except:
			# best effort, exactly as before: a failing hold is not reported
			pass
		self.log('I', 'Hold node <%s>, reason: <%s>' % (nname, text))
		
	def createClientInterface(self, sock, addr, sel):
		"""Handle a new launcher connection: run the handshake, register the node.

		Expects one line of the form
		  HELLO <version> <node-name> <bpid>:<upid>,...
		  HOLD  <version> <node-name> <bpid>:<upid>,... <reason>
		The connection is refused with 'ERR F <text>' and closed when the
		line is missing or malformed, the protocol version is not listed in
		AcceptedProtocols, the node is already connected, or the resource
		manager does not accept the node.  Otherwise 'OK' is sent, the node
		is put on hold (HOLD) or the scheduler is triggered (anything else),
		a _HostIF is created for the connection and nodeUp() is called.

		sock - the accepted socket
		addr - peer address (used for logging and kept by the _HostIF)
		sel  - selector the new _HostIF registers with

		NOTE(review): the first word is not checked, so any verb other than
		HOLD is treated like HELLO.
		NOTE(review): the scheduler is triggered before the node appears in
		HostDict - confirm that trigger() does not start processes
		synchronously.
		"""
		usage = 'Usage: (HELLO|HOLD) <version> <node-name> <bpid>:<upid>,... [<reason>]'
		stream = SockStream(sock, '\n')
		msg = stream.recv(1000)
		self.log('I','HELLO from %s: <%s>' % (addr, msg))
		if not msg or not msg.split():
			self._rejectLauncher(stream, sock, usage)
			return
		sts, protov, nname, pdict, reason = self.unpackHello(msg)
		if nname is None or pdict is None:
			self._rejectLauncher(stream, sock, usage)
			return

		if not protov in AcceptedProtocols:
			self._rejectLauncher(stream, sock,
				'Wrong protocol version <%s>' % protov)
			return

		existing = self.HostDict.get(nname)
		if existing is not None:
			self.log('I','Already connected')
			self._rejectLauncher(stream, sock, 'Already connected')
			# in case existing connection is zombie
			existing.probe()
			return
		try:
			self.ResourceMgr.setNodeStatus(nname, 'up')
		except KeyError:
			self.log('I','Unknown node <%s>' % nname)
			self._rejectLauncher(stream, sock, 'Unknown node %s' % nname)
			return
		except:
			# anything else the resource manager raises: refuse the
			# connection, as before
			self._rejectLauncher(stream, sock, 'Unknown error')
			return
		stream.send('OK')
		if sts == 'HOLD':
			self.holdNode(nname, reason)
		else:
			bmgr_global.G_Scheduler.trigger()
		self.log('I','Connection accepted')
		hif = _HostIF(nname, protov, self, stream, sel, sock, addr)
		self.HostDict[nname] = hif
		self.nodeUp(nname, pdict)

	def _rejectLauncher(self, stream, sock, text):
		# Tell the connecting launcher why it is refused, then drop the socket.
		stream.send('ERR F %s' % text)
		sock.close()

	def unpackHello(self,msg):
		"""Split a handshake line into its fields.

		Accepted forms:
		  HELLO <version> <node-name> <bpid>:<upid>,...
		  HOLD <version> <node-name> <bpid>:<upid>,... <reason>
		A process list of '_' stands for "no processes".

		Returns (verb, version, node-name, {bpid: upid}, rest-of-line), or
		five Nones when the line has fewer than four words.  The configured
		domain, if any, is stripped from the node name.
		"""
		words, rest = Parser.parseWords(msg, maxWords=4)
		if len(words) < 4:
			return None, None, None, None, None
		verb, version, host, pidspec = words[:4]
		if self.Domain:
			host, unused = fbs_misc.stripDomain(host, self.Domain)
		if pidspec == '_':
			pids = []
		else:
			pids = pidspec.split(',')
		return verb, version, host, Parser.wordsToDict(pids, defValue=0), rest

	def releaseNode(self, nname):
		"""Clear the failure history of node nname and send it RELEASE.

		Returns (status, reason): what the node's release() returned, or
		(0, 'Node <...> unavailable') when the node is not connected.
		"""
		hif = self.HostDict.get(nname)
		if hif is None:
			self.log('E','releaseNode: node <%s> unavailable' % nname)
			return 0, 'Node <%s> unavailable' % nname
		hif.clearFailures(time.time())
		sts, reason = hif.release()
		return sts, reason

	def startProcess(self, nname, sectid, procno, params, nodes,
			local_rsrc = {}, s_rp2ur = {}, pg_rp2ur = {},
			pl_rp2ur = {}, do_not_wait = 0):
		"""Start process procno of section sectid on node nname.

		pg_rp2ur and pl_rp2ur are merged into one dictionary (entries of
		pl_rp2ur win) before being passed to the node; the remaining
		arguments are forwarded to the node's startProcess() unchanged.
		The default dictionaries are only read, never modified.

		Returns (pid, reason).  pid is 0 when the node is not connected or
		a blocking start failed; with do_not_wait the node's own pid value
		is returned as is.  A started process is added to ProcList[nname],
		and an 'ERRH' failure puts the node on hold.
		"""
		jobid, sname = fbs_misc.decodeDotID(sectid)
		bpid = fbs_misc.encodeProcID(jobid, sname, procno)
		self.log('I','startProcess(%s) on <%s>' % (bpid, nname))
		hif = self.HostDict.get(nname)
		if hif is None:
			self.log('E','startProcess: node <%s> unavailable' % nname)
			return 0, 'Node <%s> unavailable' % nname
		p_rp2ur = {}
		p_rp2ur.update(pg_rp2ur)
		p_rp2ur.update(pl_rp2ur)
		pid, err, reason = hif.startProcess(jobid, sname, procno,
				params, nodes, local_rsrc, s_rp2ur, p_rp2ur, do_not_wait)
		self.log('I','%s.startProcess(bpid=%s, do_not_wait=%s): pid, err, reason = %s, %s, %s' % (
				nname, bpid, do_not_wait, pid, err, reason))
		if pid > 0:
			self.ProcList[nname].append(bpid)
		if err == 'ERRH':
			self.holdNode(nname, reason)
		if pid < 0 and not do_not_wait:
			pid = 0
		return pid, reason

	def waitForProcessStart(self, nname, sectid, procno, tmo = 180):
		"""Wait for the start reply of a process started without waiting.

		Returns (upid, reason); upid is 0 when the node is not connected or
		the start failed.  A started process is added to ProcList[nname] if
		it is not there yet, and an 'ERRH' failure puts the node on hold.
		"""
		hif = self.HostDict.get(nname)
		if hif is None:
			self.log('E','waitForProcessStart: node <%s> unavailable' % nname)
			return 0, 'Node <%s> unavailable' % nname
		jobid, sname = fbs_misc.decodeDotID(sectid)
		bpid = fbs_misc.encodeProcID(jobid, sname, procno)
		upid, err, reason = hif.waitProcessStart(bpid, tmo)
		self.log('I','%s.waitForProcessStart(%s): pid, err, reason = %s, %s, %s' % (
			nname, bpid, upid, err, reason))
		if upid <= 0:
			upid = 0
			if err == 'ERRH':
				self.holdNode(nname, reason)
		elif not bpid in self.ProcList[nname]:
			self.ProcList[nname].append(bpid)
		return upid, reason
		
	def killProcess(self, nname, sectid, procno, flag = 0):
		"""Ask node nname to kill process procno of section sectid.

		flag is passed through to the node's killProcess().  Returns
		(status, reason); status is 0 with 'Node unavailable' when the node
		is not connected.  A failed kill reported as 'ERRH' puts the node
		on hold.
		"""
		self.log('I','killProcess(%s, %s.%s) flag=%s' %
				(nname, sectid, procno, flag))
		hif = self.HostDict.get(nname)
		if hif is None:
			return 0, 'Node unavailable'
		jobid, sname = fbs_misc.decodeDotID(sectid)
		sts, err, reason = hif.killProcess(jobid, sname, procno, flag)
		if err == 'ERRH' and not sts:
			self.holdNode(nname, reason)
		return sts, reason
	
	def enable(self):
		"""Enable the server, which __init__ creates with enabled = 0."""
		self.enableServer()

	def procExit(self, nname, msg):
		"""Process an EXITED message received from node nname.

		msg has the form: EXITED <bpid> <code> <cpu> [<reason>]

		Removes the process from ProcList[nname], reports the exit to the
		owning section (if it still exists) and to the scheduler, and - when
		failure counting is configured - records a failure for the node and
		puts the node on hold once the number of failures within the last
		FailureHistoryInterval seconds reaches FailureCountThreshold.

		NOTE(review): <code> is tested for truth as returned by
		Parser.parseWords(); if that is the string '0', a successful exit
		would count as a failure - confirm what the parser returns.
		"""
		words, reason = Parser.parseWords(msg, maxWords=4)
		bpid, code, cpu = tuple(words[1:])	# skip EXITED
		self.log('I','procExit(%s, code=%s, cpu=%s, reason=%s' %
			(bpid, code, cpu, reason))

		jobid, sname, procno = fbs_misc.decodeDotID(bpid)
		try:
			self.ProcList[nname].remove(bpid)
		except (KeyError, ValueError):
			# node or process not in the list - nothing to remove
			pass
		s = self.JobFinder.getSection(fbs_misc.encodeSectionID(jobid, sname))
		if s:
			s.procExit(procno, code, cpu, reason)
			bmgr_global.G_Scheduler.procExit(s, nname)
		if code and self.FailureCountThreshold and self.FailureHistoryInterval:
			# The callbacks above may have disconnected the node, in which
			# case nodeDown() has already removed it from HostDict.
			n = self.HostDict.get(nname)
			if n is None:
				return
			t = time.time()
			window_start = t - self.FailureHistoryInterval
			n.recordFailure(t)
			fc = n.failuresSince(window_start)
			if fc >= self.FailureCountThreshold:
				self.holdNode(nname,
					"Excessive process failure rate (%s over last %s seconds)" % (fc, self.FailureHistoryInterval))
			n.clearFailures(window_start)
			
	def getProcList(self, nname):
		"""Return a copy of the list of process ids known for node nname.

		An empty list is returned for an unknown node.
		"""
		known = self.ProcList.get(nname, [])
		return known[:]		# copy, so the caller cannot alter our list

	def probe(self):
		"""Probe the connection of every node currently in HostDict."""
		# values() returns a separate list under Python 2, like the items()
		# used before, so a node removed by a probe does not upset the loop
		for hif in self.HostDict.values():
			hif.probe()

	def disconnectNode(self, nn):
		"""Close the connection to node nn; only logged if nn is not connected."""
		self.log('I','disconnectNode(%s)' % nn)
		hif = self.HostDict.get(nn)
		if hif is not None:
			hif.disconnect()
