grpc: jobset.py Source File

Go to the documentation of this file.
 # Copyright 2015 gRPC authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Run a group of subprocesses and then finish."""
  
 import collections
 import errno
 import logging
 import multiprocessing
 import os
 import platform
 import re
 import signal
 import subprocess
 import sys
 import tempfile
 import time
  
 # cpu cost measurement
 # When true, Job.start() prefixes the command line with `time -p` and
 # Job.state() parses the timing report from the output of passing jobs.
 measure_cpu_costs = False

 # Fallback used by run() for both maxjobs and maxjobs_cpu_agnostic when the
 # caller passes None.
 _DEFAULT_MAX_JOBS = 16 * multiprocessing.cpu_count()
 # Maximum amount of a job's output that will be stored in the result.
 # Only the last _MAX_RESULT_SIZE items of the output (as returned by
 # read_from_start) are kept if the actual output is longer.
 _MAX_RESULT_SIZE = 64 * 1024
  
  
 # NOTE: If you change this, please make sure to test reviewing the
 # github PR with http://reviewable.io, which is known to add UTF-8
 # characters to the PR description, which leak into the environment here
 # and cause failures.
 def strip_non_ascii_chars(s):
     return ''.join(c for c in s if ord(c) < 128)
  
  
 def sanitized_environment(env):
     sanitized = {}
     for key, value in list(env.items()):
         sanitized[strip_non_ascii_chars(key)] = strip_non_ascii_chars(value)
     return sanitized
  
  
 def platform_string():
     if platform.system() == 'Windows':
         return 'windows'
     elif platform.system()[:7] == 'MSYS_NT':
         return 'windows'
     elif platform.system() == 'Darwin':
         return 'mac'
     elif platform.system() == 'Linux':
         return 'linux'
     else:
         return 'posix'
  
  
 # setup a signal handler so that signal.pause registers 'something'
 # when a child finishes
 # not using futures and threading to avoid a dependency on subprocess32
 if platform_string() == 'windows':
     # No handlers are installed here; Jobset.reap() sleeps with time.sleep()
     # on Windows instead of calling signal.alarm()/signal.pause().
     pass
 else:

     def alarm_handler(unused_signum, unused_frame):
         """No-op handler installed for SIGALRM; both arguments are ignored."""
         pass

     # Both handlers do nothing. Jobset.reap() calls signal.alarm(10) followed
     # by signal.pause(), so a SIGCHLD or SIGALRM delivery is what lets that
     # pause() return and the running jobs be polled again.
     signal.signal(signal.SIGCHLD, lambda unused_signum, unused_frame: None)
     signal.signal(signal.SIGALRM, alarm_handler)
  
 # Unique sentinel objects for a Job's lifecycle state. Job.state() returns
 # one of them and Jobset.reap() compares against them.
 _SUCCESS = object()
 _FAILURE = object()
 _RUNNING = object()
 _KILLED = object()

 # Colour name -> [colour code, attribute]. message() emits an entry as the
 # escape sequence ESC[<attribute>;<colour code>m.
 _COLORS = {
     'red': [31, 0],
     'green': [32, 0],
     'yellow': [33, 0],
     'lightgray': [37, 0],
     'gray': [30, 1],
     'purple': [35, 0],
     'cyan': [36, 0]
 }

 # Escape sequences message() writes before every status line when stdout is
 # a tty, so the new line replaces the one currently displayed.
 _BEGINNING_OF_LINE = '\x1b[0G'
 _CLEAR_LINE = '\x1b[2K'

 # Status tag (first argument of message()) -> key into _COLORS. A tag that is
 # missing here raises KeyError in message() when it writes to a tty.
 _TAG_COLOR = {
     'FAILED': 'red',
     'FLAKE': 'purple',
     'TIMEOUT_FLAKE': 'purple',
     'WARNING': 'yellow',
     'TIMEOUT': 'red',
     'PASSED': 'green',
     'START': 'gray',
     'WAITING': 'yellow',
     'SUCCESS': 'green',
     'IDLE': 'gray',
     'SKIPPED': 'cyan'
 }

 # Root logger configuration applied at import time; message() logs through
 # logging.info() on Windows or when stdout is not a tty.
 _FORMAT = '%(asctime)-15s %(message)s'
 logging.basicConfig(level=logging.INFO, format=_FORMAT)
  
  
 def eintr_be_gone(fn):
     """Run fn until it doesn't stop because of EINTR"""
     while True:
         try:
             return fn()
         except IOError as e:
             if e.errno != errno.EINTR:
                 raise
  
  
 def message(tag, msg, explanatory_text=None, do_newline=False):
     if message.old_tag == tag and message.old_msg == msg and not explanatory_text:
         return
     message.old_tag = tag
     message.old_msg = msg
     if explanatory_text:
         if isinstance(explanatory_text, bytes):
             explanatory_text = explanatory_text.decode('utf8', errors='replace')
     while True:
         try:
             if platform_string() == 'windows' or not sys.stdout.isatty():
                 if explanatory_text:
                     logging.info(explanatory_text)
                 logging.info('%s: %s', tag, msg)
             else:
                 sys.stdout.write(
                     '%s%s%s\x1b[%d;%dm%s\x1b[0m: %s%s' %
                     (_BEGINNING_OF_LINE, _CLEAR_LINE, '\n%s' %
                      explanatory_text if explanatory_text is not None else '',
                      _COLORS[_TAG_COLOR[tag]][1], _COLORS[_TAG_COLOR[tag]][0],
                      tag, msg, '\n'
                      if do_newline or explanatory_text is not None else ''))
             sys.stdout.flush()
             return
         except IOError as e:
             if e.errno != errno.EINTR:
                 raise
  
  
 message.old_tag = ''
 message.old_msg = ''
  
  
 def which(filename):
     if '/' in filename:
         return filename
     for path in os.environ['PATH'].split(os.pathsep):
         if os.path.exists(os.path.join(path, filename)):
             return os.path.join(path, filename)
     raise Exception('%s not found' % filename)
  
  
 class JobSpec(object):
     """Specifies what to run for a job."""
  
     def __init__(self,
                  cmdline,
                  shortname=None,
                  environ=None,
                  cwd=None,
                  shell=False,
                  timeout_seconds=5 * 60,
                  flake_retries=0,
                  timeout_retries=0,
                  kill_handler=None,
                  cpu_cost=1.0,
                  verbose_success=False,
                  logfilename=None):
         """
     Arguments:
       cmdline: a list of arguments to pass as the command line
       environ: a dictionary of environment variables to set in the child process
       kill_handler: a handler that will be called whenever job.kill() is invoked
       cpu_cost: number of cores per second this job needs
       logfilename: use given file to store job's output, rather than using a temporary file
     """
         if environ is None:
             environ = {}
         self.cmdline = cmdline
         self.environ = environ
         self.shortname = cmdline[0] if shortname is None else shortname
         self.cwd = cwd
         self.shell = shell
         self.timeout_seconds = timeout_seconds
         self.flake_retries = flake_retries
         self.timeout_retries = timeout_retries
         self.kill_handler = kill_handler
         self.cpu_cost = cpu_cost
         self.verbose_success = verbose_success
         self.logfilename = logfilename
         if self.logfilename and self.flake_retries != 0 and self.timeout_retries != 0:
             # Forbidden to avoid overwriting the test log when retrying.
             raise Exception(
                 'Cannot use custom logfile when retries are enabled')
  
     def identity(self):
         return '%r %r' % (self.cmdline, self.environ)
  
     def __hash__(self):
         return hash(self.identity())
  
     def __cmp__(self, other):
         return self.identity() == other.identity()
  
     def __lt__(self, other):
         return self.identity() < other.identity()
  
     def __repr__(self):
         return 'JobSpec(shortname=%s, cmdline=%s)' % (self.shortname,
                                                       self.cmdline)
  
     def __str__(self):
         return '%s: %s %s' % (self.shortname, ' '.join(
             '%s=%s' % kv for kv in list(self.environ.items())), ' '.join(
                 self.cmdline))
  
  
 class JobResult(object):
  
     def __init__(self):
         self.state = 'UNKNOWN'
         self.returncode = -1
         self.elapsed_time = 0
         self.num_failures = 0
         self.retries = 0
         self.message = ''
         self.cpu_estimated = 1
         self.cpu_measured = 1
  
  
 def read_from_start(f):
     f.seek(0)
     return f.read()
  
  
 class Job(object):
     """Manages one job."""
  
     def __init__(self,
                  spec,
                  newline_on_success,
                  travis,
                  add_env,
                  quiet_success=False):
         self._spec = spec
         self._newline_on_success = newline_on_success
         self._travis = travis
         self._add_env = add_env.copy()
         self._retries = 0
         self._timeout_retries = 0
         self._suppress_failure_message = False
         self._quiet_success = quiet_success
         if not self._quiet_success:
             message('START', spec.shortname, do_newline=self._travis)
         self.result = JobResult()
         self.start()
  
     def GetSpec(self):
         return self._spec
  
     def start(self):
         if self._spec.logfilename:
             # make sure the log directory exists
             logfile_dir = os.path.dirname(
                 os.path.abspath(self._spec.logfilename))
             if not os.path.exists(logfile_dir):
                 os.makedirs(logfile_dir)
             self._logfile = open(self._spec.logfilename, 'w+')
         else:
             # macOS: a series of quick os.unlink invocation might cause OS
             # error during the creation of temporary file. By using
             # NamedTemporaryFile, we defer the removal of file and directory.
             self._logfile = tempfile.NamedTemporaryFile()
         env = dict(os.environ)
         env.update(self._spec.environ)
         env.update(self._add_env)
         env = sanitized_environment(env)
         self._start = time.time()
         cmdline = self._spec.cmdline
         # The Unix time command is finicky when used with MSBuild, so we don't use it
         # with jobs that run MSBuild.
         global measure_cpu_costs
         if measure_cpu_costs and not 'vsprojects\\build' in cmdline[0]:
             cmdline = ['time', '-p'] + cmdline
         else:
             measure_cpu_costs = False
         try_start = lambda: subprocess.Popen(args=cmdline,
                                              stderr=subprocess.STDOUT,
                                              stdout=self._logfile,
                                              cwd=self._spec.cwd,
                                              shell=self._spec.shell,
                                              env=env)
         delay = 0.3
         for i in range(0, 4):
             try:
                 self._process = try_start()
                 break
             except OSError:
                 message(
                     'WARNING', 'Failed to start %s, retrying in %f seconds' %
                     (self._spec.shortname, delay))
                 time.sleep(delay)
                 delay *= 2
         else:
             self._process = try_start()
         self._state = _RUNNING
  
     def state(self):
         """Poll current state of the job. Prints messages at completion."""
  
         def stdout(self=self):
             stdout = read_from_start(self._logfile)
             self.result.message = stdout[-_MAX_RESULT_SIZE:]
             return stdout
  
         if self._state == _RUNNING and self._process.poll() is not None:
             elapsed = time.time() - self._start
             self.result.elapsed_time = elapsed
             if self._process.returncode != 0:
                 if self._retries < self._spec.flake_retries:
                     message('FLAKE',
                             '%s [ret=%d, pid=%d]' %
                             (self._spec.shortname, self._process.returncode,
                              self._process.pid),
                             stdout(),
                             do_newline=True)
                     self._retries += 1
                     self.result.num_failures += 1
                     self.result.retries = self._timeout_retries + self._retries
                     # NOTE: job is restarted regardless of jobset's max_time setting
                     self.start()
                 else:
                     self._state = _FAILURE
                     if not self._suppress_failure_message:
                         message('FAILED',
                                 '%s [ret=%d, pid=%d, time=%.1fsec]' %
                                 (self._spec.shortname, self._process.returncode,
                                  self._process.pid, elapsed),
                                 stdout(),
                                 do_newline=True)
                     self.result.state = 'FAILED'
                     self.result.num_failures += 1
                     self.result.returncode = self._process.returncode
             else:
                 self._state = _SUCCESS
                 measurement = ''
                 if measure_cpu_costs:
                     m = re.search(
                         r'real\s+([0-9.]+)\nuser\s+([0-9.]+)\nsys\s+([0-9.]+)',
                         (stdout()).decode('utf8', errors='replace'))
                     real = float(m.group(1))
                     user = float(m.group(2))
                     sys = float(m.group(3))
                     if real > 0.5:
                         cores = (user + sys) / real
                         self.result.cpu_measured = float('%.01f' % cores)
                         self.result.cpu_estimated = float('%.01f' %
                                                           self._spec.cpu_cost)
                         measurement = '; cpu_cost=%.01f; estimated=%.01f' % (
                             self.result.cpu_measured, self.result.cpu_estimated)
                 if not self._quiet_success:
                     message('PASSED',
                             '%s [time=%.1fsec, retries=%d:%d%s]' %
                             (self._spec.shortname, elapsed, self._retries,
                              self._timeout_retries, measurement),
                             stdout() if self._spec.verbose_success else None,
                             do_newline=self._newline_on_success or self._travis)
                 self.result.state = 'PASSED'
         elif (self._state == _RUNNING and
               self._spec.timeout_seconds is not None and
               time.time() - self._start > self._spec.timeout_seconds):
             elapsed = time.time() - self._start
             self.result.elapsed_time = elapsed
             if self._timeout_retries < self._spec.timeout_retries:
                 message('TIMEOUT_FLAKE',
                         '%s [pid=%d]' %
                         (self._spec.shortname, self._process.pid),
                         stdout(),
                         do_newline=True)
                 self._timeout_retries += 1
                 self.result.num_failures += 1
                 self.result.retries = self._timeout_retries + self._retries
                 if self._spec.kill_handler:
                     self._spec.kill_handler(self)
                 self._process.terminate()
                 # NOTE: job is restarted regardless of jobset's max_time setting
                 self.start()
             else:
                 message('TIMEOUT',
                         '%s [pid=%d, time=%.1fsec]' %
                         (self._spec.shortname, self._process.pid, elapsed),
                         stdout(),
                         do_newline=True)
                 self.kill()
                 self.result.state = 'TIMEOUT'
                 self.result.num_failures += 1
         return self._state
  
     def kill(self):
         if self._state == _RUNNING:
             self._state = _KILLED
             if self._spec.kill_handler:
                 self._spec.kill_handler(self)
             self._process.terminate()
  
     def suppress_failure_message(self):
         self._suppress_failure_message = True
  
  
 class Jobset(object):
     """Manages one run of jobs."""
  
     def __init__(self, check_cancelled, maxjobs, maxjobs_cpu_agnostic,
                  newline_on_success, travis, stop_on_failure, add_env,
                  quiet_success, max_time):
         self._running = set()
         self._check_cancelled = check_cancelled
         self._cancelled = False
         self._failures = 0
         self._completed = 0
         self._maxjobs = maxjobs
         self._maxjobs_cpu_agnostic = maxjobs_cpu_agnostic
         self._newline_on_success = newline_on_success
         self._travis = travis
         self._stop_on_failure = stop_on_failure
         self._add_env = add_env
         self._quiet_success = quiet_success
         self._max_time = max_time
         self.resultset = {}
         self._remaining = None
         self._start_time = time.time()
  
     def set_remaining(self, remaining):
         self._remaining = remaining
  
     def get_num_failures(self):
         return self._failures
  
     def cpu_cost(self):
         c = 0
         for job in self._running:
             c += job._spec.cpu_cost
         return c
  
     def start(self, spec):
         """Start a job. Return True on success, False on failure."""
         while True:
             if self._max_time > 0 and time.time(
             ) - self._start_time > self._max_time:
                 skipped_job_result = JobResult()
                 skipped_job_result.state = 'SKIPPED'
                 message('SKIPPED', spec.shortname, do_newline=True)
                 self.resultset[spec.shortname] = [skipped_job_result]
                 return True
             if self.cancelled():
                 return False
             current_cpu_cost = self.cpu_cost()
             if current_cpu_cost == 0:
                 break
             if current_cpu_cost + spec.cpu_cost <= self._maxjobs:
                 if len(self._running) < self._maxjobs_cpu_agnostic:
                     break
             self.reap(spec.shortname, spec.cpu_cost)
         if self.cancelled():
             return False
         job = Job(spec, self._newline_on_success, self._travis, self._add_env,
                   self._quiet_success)
         self._running.add(job)
         if job.GetSpec().shortname not in self.resultset:
             self.resultset[job.GetSpec().shortname] = []
         return True
  
     def reap(self, waiting_for=None, waiting_for_cost=None):
         """Collect the dead jobs."""
         while self._running:
             dead = set()
             for job in self._running:
                 st = eintr_be_gone(lambda: job.state())
                 if st == _RUNNING:
                     continue
                 if st == _FAILURE or st == _KILLED:
                     self._failures += 1
                     if self._stop_on_failure:
                         self._cancelled = True
                         for job in self._running:
                             job.kill()
                 dead.add(job)
                 break
             for job in dead:
                 self._completed += 1
                 if not self._quiet_success or job.result.state != 'PASSED':
                     self.resultset[job.GetSpec().shortname].append(job.result)
                 self._running.remove(job)
             if dead:
                 return
             if not self._travis and platform_string() != 'windows':
                 rstr = '' if self._remaining is None else '%d queued, ' % self._remaining
                 if self._remaining is not None and self._completed > 0:
                     now = time.time()
                     sofar = now - self._start_time
                     remaining = sofar / self._completed * (self._remaining +
                                                            len(self._running))
                     rstr = 'ETA %.1f sec; %s' % (remaining, rstr)
                 if waiting_for is not None:
                     wstr = ' next: %s @ %.2f cpu' % (waiting_for,
                                                      waiting_for_cost)
                 else:
                     wstr = ''
                 message(
                     'WAITING',
                     '%s%d jobs running, %d complete, %d failed (load %.2f)%s' %
                     (rstr, len(self._running), self._completed, self._failures,
                      self.cpu_cost(), wstr))
             if platform_string() == 'windows':
                 time.sleep(0.1)
             else:
                 signal.alarm(10)
                 signal.pause()
  
     def cancelled(self):
         """Poll for cancellation."""
         if self._cancelled:
             return True
         if not self._check_cancelled():
             return False
         for job in self._running:
             job.kill()
         self._cancelled = True
         return True
  
     def finish(self):
         while self._running:
             if self.cancelled():
                 pass  # poll cancellation
             self.reap()
         if platform_string() != 'windows':
             signal.alarm(0)
         return not self.cancelled() and self._failures == 0
  
  
 def _never_cancelled():
     return False
  
  
 def tag_remaining(xs):
     staging = []
     for x in xs:
         staging.append(x)
         if len(staging) > 5000:
             yield (staging.pop(0), None)
     n = len(staging)
     for i, x in enumerate(staging):
         yield (x, n - i - 1)
  
  
 def run(cmdlines,
         check_cancelled=_never_cancelled,
         maxjobs=None,
         maxjobs_cpu_agnostic=None,
         newline_on_success=False,
         travis=False,
         infinite_runs=False,
         stop_on_failure=False,
         add_env={},
         skip_jobs=False,
         quiet_success=False,
         max_time=-1):
     if skip_jobs:
         resultset = {}
         skipped_job_result = JobResult()
         skipped_job_result.state = 'SKIPPED'
         for job in cmdlines:
             message('SKIPPED', job.shortname, do_newline=True)
             resultset[job.shortname] = [skipped_job_result]
         return 0, resultset
     js = Jobset(
         check_cancelled, maxjobs if maxjobs is not None else _DEFAULT_MAX_JOBS,
         maxjobs_cpu_agnostic if maxjobs_cpu_agnostic is not None else
         _DEFAULT_MAX_JOBS, newline_on_success, travis, stop_on_failure, add_env,
         quiet_success, max_time)
     for cmdline, remaining in tag_remaining(cmdlines):
         if not js.start(cmdline):
             break
         if remaining is not None:
             js.set_remaining(remaining)
     js.finish()
     return js.get_num_failures(), js.resultset