--- ./aiserver-orig.py 2022-11-06 11:32:18.376952288 -0000
+++ ./aiserver.py 2022-11-15 10:29:05.366757893 -0000
@@ -391,7 +391,8 @@
newlinemode = "ns"
quiet = False # If set will suppress any story text from being printed to the console (will only be seen on the client web page)
debug = False # If set to true, will send debug information to the client for display
- lazy_load = True # Whether or not to use torch_lazy_loader.py for transformers models in order to reduce CPU memory usage
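+ # Disabled because the lazy loader appears to bypass the normal from_pretrained() weight-loading path, which load_in_8bit relies on to quantize tensors as they are materialized (see the 8-bit loading in load_model).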
+ lazy_load = False # Whether or not to use torch_lazy_loader.py for transformers models in order to reduce CPU memory usage
use_colab_tpu = os.environ.get("COLAB_TPU_ADDR", "") != "" or os.environ.get("TPU_NAME", "") != "" # Whether or not we're in a Colab TPU instance or Kaggle TPU instance and are going to use the TPU rather than the CPU
revision = None
standalone = False
@@ -2089,7 +2090,8 @@
vars.sampler_order = [6, 0, 1, 2, 3, 4, 5]
vars.newlinemode = "n"
vars.revision = None
- vars.lazy_load = True
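+ # Kept in sync with the module-level default above so a settings reset does not re-enable lazy loading.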
+ vars.lazy_load = False


def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False):
@@ -2525,11 +2527,13 @@
except Exception as e:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache")
try:
- model = AutoModelForCausalLM.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", **lowmem)
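+ # load_in_8bit=True quantizes the weights to int8 via bitsandbytes at load time, and device_map="auto" lets accelerate place layers across the available GPUs (spilling to CPU RAM if needed); both packages must be installed or from_pretrained() will raise.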
+ model = AutoModelForCausalLM.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", **lowmem, load_in_8bit=True, device_map="auto")
except Exception as e:
if("out of memory" in traceback.format_exc().lower()):
raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
- model = GPTNeoForCausalLM.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", **lowmem)
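+ # The same arguments are applied to the GPTNeoForCausalLM fallback so a failed AutoModel load still comes up in 8-bit.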
+ model = GPTNeoForCausalLM.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", **lowmem, load_in_8bit=True, device_map="auto")
elif(os.path.isdir("models/{}".format(vars.model.replace('/', '_')))):
try:
tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", use_fast=False)
@@ -2542,11 +2546,12 @@
except Exception as e:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache")
try:
- model = AutoModelForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", **lowmem)
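+ # Same 8-bit arguments for checkpoints loaded from the local models/ directory.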
+ model = AutoModelForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", **lowmem, load_in_8bit=True, device_map="auto")
except Exception as e:
if("out of memory" in traceback.format_exc().lower()):
raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
- model = GPTNeoForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", **lowmem)
+ model = GPTNeoForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", **lowmem, load_in_8bit=True, device_map="auto")
else:
old_rebuild_tensor = torch._utils._rebuild_tensor
def new_rebuild_tensor(storage: Union[torch_lazy_loader.LazyTensor, torch.Storage], storage_offset, shape, stride):
@@ -2572,11 +2577,12 @@
except Exception as e:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache")
try:
- model = AutoModelForCausalLM.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", **lowmem)
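+ # Same 8-bit arguments for models fetched from the Hugging Face hub by name.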
+ model = AutoModelForCausalLM.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", **lowmem, load_in_8bit=True, device_map="auto")
except Exception as e:
if("out of memory" in traceback.format_exc().lower()):
raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
- model = GPTNeoForCausalLM.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", **lowmem)
+ model = GPTNeoForCausalLM.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", **lowmem, load_in_8bit=True, device_map="auto")
torch._utils._rebuild_tensor = old_rebuild_tensor