Source code for mltk.utils.gpu

"""GPU utilities

See the source code on GitHub: `mltk/utils/gpu.py <https://github.com/siliconlabs/mltk/blob/master/mltk/utils/gpu.py>`_
"""

import os
import sys
import re
import atexit
import logging
from collections import namedtuple

from .path import get_user_setting

from .logger import DummyLogger
from .python import SHORT_VERSION


# See:
# https://www.tensorflow.org/install/source#gpu
TensorflowCudaVersions = namedtuple('TensorflowCudaVersions', ['tensorflow', 'cudnn', 'cuda', 'python_min', 'python_max'])
TENSORFLOW_CUDA_COMPATIBILITY = [
    # This should be in DESCENDING order of the Tensorflow version
    TensorflowCudaVersions('2.8', '8.1', '11.2', '3.7', '3.10'),
    TensorflowCudaVersions('2.7', '8.1', '11.2', '3.7', '3.9'),
    TensorflowCudaVersions('2.6', '8.1', '11.2', '3.7', '3.9'),
    TensorflowCudaVersions('2.5', '8.1', '11.2', '3.7', '3.9'),
    TensorflowCudaVersions('2.4', '8.0', '11.0', '3.7', '3.8'),
    TensorflowCudaVersions('2.3', '7.6', '10.1', '3.7', '3.8'),
]
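
# For example, the first (newest) entry above records that Tensorflow 2.8
# requires CuDNN 8.1 and CUDA 11.2, and supports Python 3.7 through 3.10.
# A quick illustrative check (not part of the original module):
#
#   ver = TENSORFLOW_CUDA_COMPATIBILITY[0]
#   assert (ver.tensorflow, ver.cudnn, ver.cuda) == ('2.8', '8.1', '11.2')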



def disable():
    """Disable the GPU from being used by Tensorflow"""
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
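
# Example usage (a minimal sketch; assumes this module is importable as mltk.utils.gpu):
#
#   from mltk.utils import gpu
#   gpu.disable()   # must run before Tensorflow initializes the CUDA runtime
#   import tensorflow as tf
#   assert tf.config.list_physical_devices('GPU') == []
#
# Note that CUDA_VISIBLE_DEVICES=-1 only takes effect if it is set before
# Tensorflow creates its device context.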
def initialize(logger=None):
    """Initialize the GPU for usage with Tensorflow

    NOTE: The deinitialize() API will automatically be called when the script exits
    """
    selected_gpus = globals().get('selected_gpus', [])
    if selected_gpus:
        return
    globals()['selected_gpus'] = []

    logger = logger or DummyLogger()

    CUDA_VISIBLE_DEVICES = get_user_setting(
        'cuda_visible_devices',
        os.getenv('MLTK_CUDA_VISIBLE_DEVICES',
            os.getenv('CUDA_VISIBLE_DEVICES', '')
        )
    ).strip()
    logger.debug(f'CUDA_VISIBLE_DEVICES={CUDA_VISIBLE_DEVICES}')

    if CUDA_VISIBLE_DEVICES == '-1':
        return

    try:
        # Select an available GPU
        import GPUtil
        import tensorflow as tf

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        gpus = GPUtil.getGPUs()
        if len(gpus) == 0:
            logger.info("No GPUs found, using CPU for training")
            os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
            return

        logger.debug("Available GPUs:\n" + "\n".join([f"- {g.name} (id={g.id})" for g in gpus]))

        if not CUDA_VISIBLE_DEVICES:
            # No GPU was explicitly specified, so select the one with the most free memory
            logger.debug('Searching for the best available GPU')
            best_gpu = gpus[0]
            for gpu in gpus[1:]:
                if gpu.memoryFree > best_gpu.memoryFree:
                    best_gpu = gpu
            CUDA_VISIBLE_DEVICES = str(best_gpu.id)
        elif CUDA_VISIBLE_DEVICES == 'all':
            logger.debug('Using all available GPUs')
            CUDA_VISIBLE_DEVICES = ','.join(str(x.id) for x in gpus)

        os.environ['CUDA_VISIBLE_DEVICES'] = CUDA_VISIBLE_DEVICES

        # Enable dynamic memory growth for the selected GPU(s)
        try:
            tf_gpus = tf.config.list_physical_devices('GPU')
            if len(tf_gpus) == 0:
                _print_warning_msg(logger)
                deinitialize(force=True)
                os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
                return

            gpu_ids = [int(x) for x in CUDA_VISIBLE_DEVICES.split(',')]
            for gpu_id in gpu_ids:
                globals()['selected_gpus'].append(gpu_id)
                for tf_gpu in tf_gpus:
                    if tf_gpu.name.endswith(f'GPU:{gpu_id}'):
                        tf.config.experimental.set_memory_growth(tf_gpu, True)
                        logger.info(f"Selecting GPU: {gpus[gpu_id].name} (id={gpu_id})")
        except Exception as e:
            logger.debug(f'Error configuring GPU(s), err: {e}')

        # The TfLiteConverter adds a StreamHandler to the root logger,
        # remove it so we don't double-print everything to the console
        logging.getLogger().handlers.clear()

        atexit.register(deinitialize)

    except Exception as e:
        err_msg = f'{e}'
        logger.warning(f'GPU init err: {err_msg}')
        if 'Driver/library version mismatch' in err_msg:
            _print_warning_msg(logger)

        logger.info("Using CPU for training")
        deinitialize()
        os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
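
# Example usage (a minimal sketch; the logger name is a hypothetical placeholder):
#
#   import logging
#   from mltk.utils import gpu
#
#   logger = logging.getLogger('my_training_script')
#   gpu.initialize(logger=logger)   # selects the GPU with the most free memory
#   # ... train as usual; deinitialize() runs automatically at exit
#
# Setting MLTK_CUDA_VISIBLE_DEVICES=all (or the user setting 'cuda_visible_devices')
# would instead select every available GPU.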
def deinitialize(force=False):
    """De-initialize the GPU

    NOTE: This is automatically called when the script exits
    """
    selected_gpus = globals().get('selected_gpus', [])
    try:
        from numba import cuda
        for gpu_id in selected_gpus:
            cuda.select_device(gpu_id)
            cuda.close()
    except:
        pass
    finally:
        selected_gpus.clear()
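
# deinitialize() is registered via atexit, but it can also be called manually,
# e.g. to release the CUDA context between back-to-back runs in the same process.
# A sketch (assumes numba is installed so the CUDA context can actually be closed):
#
#   gpu.initialize()
#   # ... first training run ...
#   gpu.deinitialize()   # clears 'selected_gpus', so initialize() may run again
#   gpu.initialize()
#   # ... second training run ...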
def get_tensorflow_version_with_cudnn_version(cudnn_ver:str) -> TensorflowCudaVersions:
    """Return the newest Tensorflow version compatible with the given CuDNN version,
    or None if no compatible version is found
    """
    toks = cudnn_ver.split('.')
    cudnn_ver = '.'.join(toks[:2]) # strip off the "patch" part of the version string

    # Find the largest Tensorflow version that is
    # compatible with the given CuDNN version
    for ver in TENSORFLOW_CUDA_COMPATIBILITY:
        if ver.cudnn == cudnn_ver:
            return ver

    return None
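
# For example, per the TENSORFLOW_CUDA_COMPATIBILITY table above, a CuDNN 8.1.x
# runtime maps to Tensorflow 2.8 (the newest matching entry, since the table is
# searched in descending order):
#
#   ver = get_tensorflow_version_with_cudnn_version('8.1.0')
#   assert ver.tensorflow == '2.8' and ver.cuda == '11.2'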
def check_tensorflow_cuda_compatibility_error(log_file_path:str) -> str:
    """Parse the given log file for CuDNN/CUDA compatibility errors and return
    a message describing how to resolve them, or None if no errors were found
    """
    try:
        with open(log_file_path, 'r') as f:
            log_lines = f.read().splitlines()
    except:
        return None

    required_tensorflow_version = None
    invalid_gpu_driver = False

    cuda_error_re = re.compile(r'.*Loaded runtime CuDNN library: (\d+\.\d+\.\d+) but source was compiled with: (\d+\.\d+\.\d+).*')
    for line in log_lines:
        match = cuda_error_re.match(line)
        if match:
            installed_cudnn_ver = match.group(1)
            expected_cudnn_ver = match.group(2)
            required_tensorflow_version = get_tensorflow_version_with_cudnn_version(installed_cudnn_ver)
        elif 'DNN library is not found' in line:
            invalid_gpu_driver = True

    if not (required_tensorflow_version or invalid_gpu_driver):
        return None

    def _current_python_version_supported(ver:TensorflowCudaVersions) -> bool:
        def _version_to_int(v):
            toks = v.split('.')
            return int(toks[0]) * 100 + int(toks[1])
        current_python = _version_to_int(SHORT_VERSION)
        python_min = _version_to_int(ver.python_min)
        python_max = _version_to_int(ver.python_max)
        return python_min <= current_python <= python_max

    retval  = 'There appears to be a compatibility issue between the MLTK Python venv Tensorflow version and the installed GPU driver.\n'
    retval += 'For a compatibility list, see:\n'
    retval += 'https://www.tensorflow.org/install/source#gpu\n\n'
    retval += 'Recommended solutions:\n\n'

    count = 1
    if required_tensorflow_version is not None:
        retval += f'{count}. Update the Tensorflow version in the MLTK Python venv to match the installed GPU driver by running:\n'
        retval += f'   {"pip" if os.name == "nt" else "pip3"} install tensorflow=={required_tensorflow_version.tensorflow}.*\n'
        if not _current_python_version_supported(required_tensorflow_version):
            retval += '\n'
            retval += f'   NOTE: Your current Python version: {SHORT_VERSION} is NOT supported by Tensorflow-{required_tensorflow_version.tensorflow}\n'
            retval += '   To resolve this issue:\n'
            retval += f'   1. Create a new Python virtual environment using a Python version {required_tensorflow_version.python_min}-{required_tensorflow_version.python_max}\n'
            retval += '   2. Install the MLTK Python package\n'
            retval += f'   3. Install Tensorflow-{required_tensorflow_version.tensorflow}\n'
        retval += '\n'
        count += 1

    retval += f'{count}. Update your GPU driver to match the installed Tensorflow version in the MLTK venv, see:\n'
    retval += '   https://www.tensorflow.org/install/gpu\n'
    retval += '   https://www.tensorflow.org/install/source#gpu\n\n'
    count += 1

    retval += f'{count}. Disable the GPU by defining the environment variable: CUDA_VISIBLE_DEVICES=-1, e.g.:\n'
    if os.name == 'nt':
        retval += '   set CUDA_VISIBLE_DEVICES=-1\n'
    else:
        retval += '   export CUDA_VISIBLE_DEVICES=-1\n'

    return retval
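
# Example usage (a sketch; 'train.log' is a hypothetical log file path):
#
#   msg = check_tensorflow_cuda_compatibility_error('train.log')
#   if msg:
#       print(msg)   # prints the recommended solutions for the version mismatch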
def _print_warning_msg(logger):
    logger.warning(
        '\n\n\n'
        '*******************************************************************************\n'
        'WARNING: Failed to load GPU driver\n'
        '\n'
        'This could mean that the driver or CUDA libraries are not properly installed,\n'
        'or that your installed GPU driver does not match the Tensorflow version.\n\n'
        'Refer to the Tensorflow GPU installation guide here:\n'
        'https://www.tensorflow.org/install/gpu\n'
        'https://www.tensorflow.org/install/source#gpu\n'
        '\n'
        'Alternatively, you can disable the GPU by defining the environment variable: CUDA_VISIBLE_DEVICES=-1\n'
        'e.g.:\n'
        f'{"set" if os.name == "nt" else "export"} CUDA_VISIBLE_DEVICES=-1\n\n'
        '*******************************************************************************\n'
    )