"""GPU utilities
See the source code on Github: `mltk/utils/gpu.py <https://github.com/siliconlabs/mltk/blob/master/mltk/utils/gpu.py>`_
"""
import os
import sys
import re
import atexit
import logging
from collections import namedtuple
from .path import get_user_setting
from .logger import DummyLogger
from .python import SHORT_VERSION
# Tensorflow <-> CUDA/CuDNN/Python compatibility matrix, see:
# https://www.tensorflow.org/install/source#gpu
# Each entry maps a Tensorflow release to the CuDNN and CUDA versions it was
# built against, plus the min/max Python versions it supports.
TensorflowCudaVersions = namedtuple('TensorflowCudaVersions', ['tensorflow', 'cudnn', 'cuda', 'python_min', 'python_max'])
TENSORFLOW_CUDA_COMPATIBILITY = [
    # This should be in DESCENDING order of the Tensorflow version
    # (lookups return the first, i.e. newest, match)
    TensorflowCudaVersions('2.8', '8.1', '11.2', '3.7', '3.10'),
    TensorflowCudaVersions('2.7', '8.1', '11.2', '3.7', '3.9'),
    TensorflowCudaVersions('2.6', '8.1', '11.2', '3.7', '3.9'),
    TensorflowCudaVersions('2.5', '8.1', '11.2', '3.7', '3.9'),
    TensorflowCudaVersions('2.4', '8.0', '11.0', '3.7', '3.8'),
    TensorflowCudaVersions('2.3', '7.6', '10.1', '3.7', '3.8'),
]
def disable():
    """Prevent Tensorflow from using any GPU.

    CUDA treats ``CUDA_VISIBLE_DEVICES=-1`` as "no visible devices",
    which forces Tensorflow to fall back to the CPU.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
def initialize(logger=None):
    """Initialize the GPU for usage with Tensorflow

    Selects the GPU(s) given by the ``cuda_visible_devices`` user setting /
    ``MLTK_CUDA_VISIBLE_DEVICES`` / ``CUDA_VISIBLE_DEVICES`` (or the GPU with
    the most free memory when unset), and enables dynamic memory growth.
    Falls back to the CPU if no usable GPU is found.

    NOTE: The deinitialize() API will automatically be called when the script exits

    Args:
        logger: Optional logger; a DummyLogger is used when omitted
    """
    # Already initialized -> nothing more to do
    selected_gpus = globals().get('selected_gpus', [])
    if selected_gpus:
        return
    globals()['selected_gpus'] = []

    # BUGFIX: default the logger *before* its first use.
    # Previously logger.debug() below was reached while logger could still
    # be None, raising AttributeError.
    logger = logger or DummyLogger()

    # Resolution order: user setting -> MLTK_CUDA_VISIBLE_DEVICES -> CUDA_VISIBLE_DEVICES
    CUDA_VISIBLE_DEVICES = get_user_setting(
        'cuda_visible_devices',
        os.getenv('MLTK_CUDA_VISIBLE_DEVICES',
            os.getenv('CUDA_VISIBLE_DEVICES', '')
        )
    ).strip()

    logger.debug(f'CUDA_VISIBLE_DEVICES={CUDA_VISIBLE_DEVICES}')
    if CUDA_VISIBLE_DEVICES == '-1':
        # The GPU has been explicitly disabled
        return

    try:
        # Select an available GPU
        import GPUtil
        import tensorflow as tf

        # Ensure GPUtil's ids and CUDA's device ids agree
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        gpus = GPUtil.getGPUs()
        if len(gpus) == 0:
            logger.info("No GPUs found, using CPU for training")
            os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
            return

        logger.debug(f"Available GPUs:\n" + "\n".join([f"- {g.name} (id={g.id})" for g in gpus]))

        if not CUDA_VISIBLE_DEVICES:
            # No explicit selection -> pick the GPU with the most free memory
            logger.debug('Searching best GPU available')
            best_gpu = gpus[0]
            for gpu in gpus[1:]:
                if gpu.memoryFree > best_gpu.memoryFree:
                    best_gpu = gpu
            CUDA_VISIBLE_DEVICES = str(best_gpu.id)
        elif CUDA_VISIBLE_DEVICES == 'all':
            logger.debug('Using all available GPUs')
            CUDA_VISIBLE_DEVICES = ','.join(str(x.id) for x in gpus)

        os.environ['CUDA_VISIBLE_DEVICES'] = CUDA_VISIBLE_DEVICES

        # Enable dynamic memory growth for the selected GPU(s)
        try:
            tf_gpus = tf.config.list_physical_devices('GPU')
            if len(tf_gpus) == 0:
                # GPUtil saw a device but Tensorflow could not load it
                # -> likely a driver/CUDA mismatch
                _print_warning_msg(logger)
                deinitialize(force=True)
                os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
                return

            gpu_ids = [int(x) for x in CUDA_VISIBLE_DEVICES.split(',')]
            for gpu_id in gpu_ids:
                globals()['selected_gpus'].append(gpu_id)
                for tf_gpu in tf_gpus:
                    if tf_gpu.name.endswith(f'GPU:{gpu_id}'):
                        tf.config.experimental.set_memory_growth(tf_gpu, True)
                        logger.info(f"Selecting GPU : {gpus[gpu_id].name} (id={gpu_id})")
        except Exception as e:
            logger.debug(f'Error configuring GPU(s), err: {e}')

        # The TfLiteConverter adds a StreamHandler to the root logger,
        # remove it so we don't double print everything to the console
        logging.getLogger().handlers.clear()
        atexit.register(deinitialize)

    except Exception as e:
        err_msg = f'{e}'
        logger.warning(f'GPU init err: {err_msg}')
        if 'Driver/library version mismatch' in err_msg:
            _print_warning_msg(logger)
        logger.info("Using CPU for training")
        deinitialize()
        os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
def deinitialize(force=False):
    """De-initialize the GPU by releasing any CUDA contexts held by this process.

    NOTE: This is automatically called when the script exits

    Args:
        force: Present for API compatibility; not used by the visible
               implementation — TODO confirm against callers
    """
    selected_gpus = globals().get('selected_gpus', [])
    try:
        # numba is an optional dependency; cleanup is strictly best-effort
        from numba import cuda
        for gpu_id in selected_gpus:
            cuda.select_device(gpu_id)
            cuda.close()
    # BUGFIX: was a bare `except:` which also swallowed
    # SystemExit/KeyboardInterrupt — narrow to Exception
    except Exception:
        pass
    finally:
        # Mark all GPUs as released regardless of cleanup success
        selected_gpus.clear()
def get_tensorflow_version_with_cudnn_version(cudnn_ver:str) -> TensorflowCudaVersions:
    """Return the newest compatibility entry for the given CuDNN version.

    Args:
        cudnn_ver: CuDNN version string, e.g. "8.1.0"

    Returns:
        The matching TensorflowCudaVersions entry, or None if no
        Tensorflow version is compatible with the given CuDNN version
    """
    # Strip off the "patch" component, keeping only "major.minor"
    major_minor = '.'.join(cudnn_ver.split('.')[:2])
    # The table is sorted newest-first, so the first match is the
    # largest compatible Tensorflow version
    return next(
        (entry for entry in TENSORFLOW_CUDA_COMPATIBILITY if entry.cudnn == major_minor),
        None
    )
[docs]def check_tensorflow_cuda_compatibility_error(log_file_path:str) -> str:
try:
with open(log_file_path, 'r') as f:
log_lines = f.read().splitlines()
except:
return None
required_tensorflow_version = None
invalid_gpu_driver = False
cuda_error_re = re.compile(r'.*Loaded runtime CuDNN library: (\d+\.\d+\.\d+) but source was compiled with: (\d+\.\d+\.\d+).*')
for line in log_lines:
match = cuda_error_re.match(line)
if match:
installed_cudnn_ver = match.group(1)
expected_cudnn_ver = match.group(2)
required_tensorflow_version = get_tensorflow_version_with_cudnn_version(installed_cudnn_ver)
elif 'DNN library is not found' in line:
invalid_gpu_driver = True
if not(required_tensorflow_version or invalid_gpu_driver):
return None
def _current_python_version_supported(ver:TensorflowCudaVersions) -> bool:
def _version_to_int(v):
toks = v.split('.')
return int(toks[0]) * 100 + int(toks[1])
current_python = _version_to_int(SHORT_VERSION)
python_min = _version_to_int(ver.python_min)
python_max = _version_to_int(ver.python_max)
return current_python >= python_min and current_python <= python_max
retval = 'There appears to be a compatibility issue with MLTK Python venv Tensorflow version and installed GPU driver.\n'
retval += 'For a compatibility list, see:\n'
retval += 'https://www.tensorflow.org/install/source#gpu\n\n'
retval += 'Recommended solutions:\n\n'
count = 1
if required_tensorflow_version is not None:
retval += f'{count}. Update the Tensorflow version in the MLTK Python venv to match the installed GPU driver by running:\n'
retval += f' {"pip" if os.name == "nt" else "pip3"} install tensorflow=={required_tensorflow_version.tensorflow}.*\n'
if not _current_python_version_supported(required_tensorflow_version):
retval += '\n'
retval += f' NOTE: Your current Python version: {SHORT_VERSION} is NOT supported by Tensorflow-{required_tensorflow_version.tensorflow}\n'
retval += ' To resolve this issue:\n'
retval += f' 1. Created a new Python virtual environment using a Python version {required_tensorflow_version.python_min}-{required_tensorflow_version.python_max}\n'
retval += ' 2. Install the MLTK Python package\n'
retval += f' 3. Install Tensorflow-{required_tensorflow_version.tensorflow}\n'
retval += '\n'
count += 1
retval += f'{count}. Update your GPU driver to match the installed Tensorflow version in the MLTK venv, see:\n'
retval += ' https://www.tensorflow.org/install/gpu\n'
retval += ' https://www.tensorflow.org/install/source#gpu\n\n'
count += 1
retval += f'{count}. Disable the GPU by defining the environment variable: CUDA_VISIBLE_DEVICES=-1, e.g.:\n'
if os.name == 'nt':
retval += ' set CUDA_VISIBLE_DEVICES=-1\n'
else:
retval += ' export CUDA_VISIBLE_DEVICES=-1\n'
return retval
def _print_warning_msg(logger):
logger.warning('\n\n\n'
'*******************************************************************************\n'
'WARNING: Failed to load GPU driver\n'
'\n'
'This could mean that the driver or CUDA libraries are not properly installed,\n'
'or that your installed GPU driver does not match the Tensorflow version.\n\n'
'Refer to the Tensorflow GPU installation guide here:\n'
'https://www.tensorflow.org/install/gpu\n'
'https://www.tensorflow.org/install/source#gpu\n'
'\n'
'Alternatively, you can disable the GPU by defining the environment variable: CUDA_VISIBLE_DEVICES=-1\n'
'.e.g.:\n'
f'{"set" if os.name == "nt" else "export"} CUDA_VISIBLE_DEVICES=-1\n\n'
'*******************************************************************************\n'
)