tf.GPUOptions not applying with set_session() in Keras

I am trying to increment the per_process_gpu_memory_fraction value in my tf.GPUOptions() and then change the Keras session with set_session(). However, the memory fraction never actually changes. After the first pass of the while loop, 319 MB is reserved, as shown in nvidia-smi, which

a) never gets released when clear_session() is called, and

b) doesn't go up on the next iteration of the while loop.

import GPUtil
import time

import tensorflow as tf
import numpy as np

from keras.backend.tensorflow_backend import set_session, clear_session, get_session
from tensorflow.python.framework.errors_impl import ResourceExhaustedError, UnknownError
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical


def model_trainer():
    y_pred = None
    errors = 0
    total_ram = GPUtil.getGPUs()[0].memoryTotal
    total_ram_allowed = GPUtil.getGPUs()[0].memoryTotal * 0.90
    mem_amount = 0.005 # intentionally small so the loop is forced
                       # to increment mem_amount

    x_train = np.empty((10000, 100))
    y_train = np.random.randint(0, 9, size=10000)
    y_train = to_categorical(y_train, 10)

    while y_pred is None:
        print("mem", mem_amount)
        if total_ram_allowed > total_ram * mem_amount and GPUtil.getGPUs()[0].memoryFree > total_ram * mem_amount:
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_amount)
            config = tf.ConfigProto(
                intra_op_parallelism_threads=2,
                inter_op_parallelism_threads=2,
                gpu_options=gpu_options)

            sess = tf.Session(config=config)  # fresh session with the new fraction
            set_session(sess)                 # make it the Keras session
            model = Sequential()
            model.add(Dense(units=64, activation='relu', input_dim=100))
            model.add(Dense(units=1024, activation='relu'))
            model.add(Dense(units=1024, activation='relu'))
            model.add(Dense(units=1024, activation='relu'))
            model.add(Dense(units=1024, activation='relu'))
            model.add(Dense(units=1024, activation='relu'))
            model.add(Dense(units=10, activation='softmax'))
            model.compile(loss='categorical_crossentropy',
                          optimizer='sgd',
                          metrics=['accuracy'])

            try:
                print(sess)

                model.fit(x_train, y_train, epochs=5, batch_size=32)
                y_pred = model.predict(x_train)

            except (ResourceExhaustedError, UnknownError) as e:
                if mem_amount > 1.0:
                    raise ValueError('model too large for vram')
                mem_amount += 0.05
                clear_session()
                errors += 1
        else:
            clear_session()


if __name__ == "__main__":
    model_trainer()

The puzzling thing is that Keras willingly adopts the new session (as a get_session() call shows), but never applies the new GPUOptions.
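As far as I can tell, the same thing happens even without Keras: in TF 1.x the per-process GPU allocator seems to be created when the first session initializes the GPU, and later sessions reuse it no matter what their GPUOptions say. A minimal sketch of that behaviour (session_with_fraction is a helper name I made up):

import tensorflow as tf

def session_with_fraction(frac):
    # Ask for `frac` of the GPU's memory for this process.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=frac)
    return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

sess_a = session_with_fraction(0.05)   # ~5% of VRAM reserved at creation
sess_a.close()                         # nvidia-smi: the reservation stays

sess_b = session_with_fraction(0.50)   # still ~5%: the per-process GPU
                                       # allocator was already created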

In addition to the example above, I have tried:

clear_session()
del model

clear_session()
del model
gc.collect()

None of this has worked in releasing the VRAM.

My overall goal is to use trial and error until the process has enough VRAM to train on, since there seems to be no good way of figuring out how much VRAM a Keras model needs without simply running it. That would let me run multiple models in parallel on a single GPU. When the ResourceExhaustedError occurs, I want to release the VRAM held by Keras and then try again with a larger amount. Is there any way to accomplish this?

After searching for a while, I found that TensorFlow will only ever take VRAM and will never release it until the process dies, even if del model and clear_session() are used. I also tried the method shown here (https://github.com/keras-team/keras/issues/9379), which uses:

from keras import backend as K
K.clear_session()  # drop the Keras graph/session state

from numba import cuda
cuda.select_device(0)  # bind to GPU 0
cuda.close()           # tear down the CUDA context, freeing the VRAM

This resulted in an error for me: when TensorFlow tried to access the GPU again, its pointer to the memory space was invalid, because the CUDA context had been killed with cuda.close(). So the only way around it is to use processes rather than threads (I tried threads too, with the same issue as before).

The other thing I found is that while there are methods for estimating how much VRAM a Keras model will use, they are not very accurate (see: How to determine needed memory of Keras model?). I also tried computing it directly from the Keras layers, and that varied wildly, so it wasn't accurate either. That really only leaves trial and error: catching the ResourceExhaustedError and trying again.
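For what it's worth, the estimates in question boil down to something like the sketch below (my own rough version, assuming float32 values; estimate_model_vram is a hypothetical name). It counts the weights plus one batch of layer activations, and ignores gradients, optimizer slots, and cuDNN workspace, which is a large part of why it undershoots:

import numpy as np

def estimate_model_vram(model, batch_size, bytes_per_value=4):
    # Weights: every trainable and non-trainable parameter.
    weight_count = model.count_params()
    # Activations: one batch worth of every layer's output.
    activation_count = 0
    for layer in model.layers:
        shape = layer.output_shape  # e.g. (None, 1024)
        activation_count += np.prod([d for d in shape if d is not None])
    return bytes_per_value * (weight_count + batch_size * activation_count)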

Below is my code for running multiple different Keras models on a single GPU.

import GPUtil
import time
import multiprocessing

import tensorflow as tf
import numpy as np

from keras.backend.tensorflow_backend import set_session, clear_session, get_session
from tensorflow.python.framework.errors_impl import ResourceExhaustedError, UnknownError
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical


def model_trainer():
    mem_amount = 0.05

    x_train = np.empty((100000, 100))
    y_train = np.random.randint(0, 9, size=100000)
    y_train = to_categorical(y_train, 10)

    manager = multiprocessing.Manager()
    return_dict = manager.dict()

    # note: a nested function only works as a Process target under the
    # 'fork' start method (the Linux default); 'spawn' cannot pickle it
    def worker(mem_amount, return_dict):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_amount)
        config = tf.ConfigProto(
            intra_op_parallelism_threads=2,
            inter_op_parallelism_threads=2,
            gpu_options=gpu_options)
        sess = tf.Session(config=config)
        set_session(sess)

        model = Sequential()
        model.add(Dense(units=64, activation='relu', input_dim=100))
        model.add(Dense(units=1024, activation='relu'))
        model.add(Dense(units=1024, activation='relu'))
        model.add(Dense(units=2048, activation='relu'))
        model.add(Dense(units=10, activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='sgd',
                      metrics=['accuracy'])

        try:
            get_session()

            model.fit(x_train, y_train, epochs=5, batch_size=1000)

            return_dict["valid"] = True

        except (ResourceExhaustedError, UnknownError) as e:
            return  # OOM: let the process exit so the driver reclaims its VRAM

    while "valid" not in list(return_dict.keys()):
        print("mem", mem_amount)

        total_ram = GPUtil.getGPUs()[0].memoryTotal
        total_ram_allowed = GPUtil.getGPUs()[0].memoryTotal * 0.90

        # can add in a for loop to have multiple models
        if total_ram_allowed > total_ram * mem_amount and GPUtil.getGPUs()[0].memoryFree > total_ram * mem_amount:
            p = multiprocessing.Process(target=worker, args=(mem_amount, return_dict))
            p.start()
            p.join()

            print(return_dict.values())

            if "valid" not in list(return_dict.keys()):
                if mem_amount > 1.0:
                    raise ValueError('model too large for vram')
                else:
                    mem_amount += 0.05
            else:
                break
        else:
            time.sleep(10)


if __name__ == "__main__":
    model_trainer()
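Since each attempt now lives in its own process with its own memory cap, running several models side by side is just a matter of starting more than one trainer process, along the lines of the sketch below (it assumes the fractions the trainers end up with fit on the GPU together):

if __name__ == "__main__":
    # Hypothetical: two independent trainers sharing one GPU; each retries
    # with a bigger memory fraction until its model fits.
    trainers = [multiprocessing.Process(target=model_trainer) for _ in range(2)]
    for p in trainers:
        p.start()
    for p in trainers:
        p.join()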
