diff --git a/README.md b/README.md index 2869aaec..a72e8060 100644 --- a/README.md +++ b/README.md @@ -245,7 +245,7 @@ usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [- [--extra-flags EXTRA_FLAGS] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--enable-tp] [--tp-backend TP_BACKEND] [--gpu-split GPU_SPLIT] [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--cpp-runner] - [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] + [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] @@ -334,11 +334,6 @@ ExLlamaV2: TensorRT-LLM: --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. -DeepSpeed: - --deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. - --nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading. - --local_rank LOCAL_RANK DeepSpeed: Optional argument for distributed setups. - RoPE: --alpha_value ALPHA_VALUE Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. --rope_freq_base ROPE_FREQ_BASE If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63). diff --git a/docs/08 - Additional Tips.md b/docs/08 - Additional Tips.md index 079d1da0..e6e2b3c9 100644 --- a/docs/08 - Additional Tips.md +++ b/docs/08 - Additional Tips.md @@ -13,29 +13,6 @@ Source: https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/1126 This file will be automatically detected the next time you start the web UI. -## DeepSpeed - -`DeepSpeed ZeRO-3` is an alternative offloading strategy for full-precision (16-bit) transformers models. - -With this, I have been able to load a 6b model (GPT-J 6B) with less than 6GB of VRAM. The speed of text generation is very decent and much better than what would be accomplished with `--auto-devices --gpu-memory 6`. - -As far as I know, DeepSpeed is only available for Linux at the moment. - -### How to use it - -1. Install DeepSpeed: - -``` -conda install -c conda-forge mpi4py mpich -pip install -U deepspeed -``` - -2. Start the web UI replacing `python` with `deepspeed --num_gpus=1` and adding the `--deepspeed` flag. Example: - -``` -deepspeed --num_gpus=1 server.py --deepspeed --chat --model gpt-j-6B -``` - ## Miscellaneous info ### You can train LoRAs in CPU mode diff --git a/modules/deepspeed_parameters.py b/modules/deepspeed_parameters.py deleted file mode 100644 index f170a385..00000000 --- a/modules/deepspeed_parameters.py +++ /dev/null @@ -1,74 +0,0 @@ -def generate_ds_config(ds_bf16, train_batch_size, nvme_offload_dir): - ''' - DeepSpeed configuration - https://huggingface.co/docs/transformers/main_classes/deepspeed - ''' - - if nvme_offload_dir: - ds_config = { - "fp16": { - "enabled": not ds_bf16, - }, - "bf16": { - "enabled": ds_bf16, - }, - "zero_optimization": { - "stage": 3, - "offload_param": { - "device": "nvme", - "nvme_path": nvme_offload_dir, - "pin_memory": True, - "buffer_count": 5, - "buffer_size": 1e9, - "max_in_cpu": 1e9 - }, - "overlap_comm": True, - "reduce_bucket_size": "auto", - "contiguous_gradients": True, - "sub_group_size": 1e8, - "stage3_prefetch_bucket_size": "auto", - "stage3_param_persistence_threshold": "auto", - "stage3_max_live_parameters": "auto", - "stage3_max_reuse_distance": "auto", - }, - "aio": { - "block_size": 262144, - "queue_depth": 32, - "thread_count": 1, - "single_submit": False, - "overlap_events": True - }, - "steps_per_print": 2000, - "train_batch_size": train_batch_size, - "train_micro_batch_size_per_gpu": 1, - "wall_clock_breakdown": False - } - else: - ds_config = { - "fp16": { - "enabled": not ds_bf16, - }, - "bf16": { - "enabled": ds_bf16, - }, - "zero_optimization": { - "stage": 3, - "offload_param": { - "device": "cpu", - "pin_memory": True - }, - "overlap_comm": True, - "contiguous_gradients": True, - "reduce_bucket_size": "auto", - "stage3_prefetch_bucket_size": "auto", - "stage3_param_persistence_threshold": "auto", - "stage3_max_live_parameters": "auto", - "stage3_max_reuse_distance": "auto", - }, - "steps_per_print": 2000, - "train_batch_size": train_batch_size, - "train_micro_batch_size_per_gpu": 1, - "wall_clock_breakdown": False - } - - return ds_config diff --git a/modules/shared.py b/modules/shared.py index e3ef3c91..c917377a 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -144,12 +144,6 @@ group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N', group = parser.add_argument_group('TensorRT-LLM') group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.') -# DeepSpeed -group = parser.add_argument_group('DeepSpeed') -group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.') -group.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.') -group.add_argument('--local_rank', type=int, default=0, help='DeepSpeed: Optional argument for distributed setups.') - # RoPE group = parser.add_argument_group('RoPE') group.add_argument('--alpha_value', type=float, default=1, help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.') diff --git a/modules/text_generation.py b/modules/text_generation.py index daaf08e7..845711ce 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -372,8 +372,6 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None, generate_params['negative_prompt_ids'] = encode(state['negative_prompt']) generate_params.update({'use_cache': not shared.args.no_cache}) - if shared.args.deepspeed: - generate_params.update({'synced_gpus': True}) # Encode the input input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state)) diff --git a/modules/torch_utils.py b/modules/torch_utils.py index 418520a8..ba906857 100644 --- a/modules/torch_utils.py +++ b/modules/torch_utils.py @@ -12,9 +12,6 @@ def get_device(): return shared.model.device elif torch.cuda.is_available(): return torch.device('cuda') - elif shared.args.deepspeed: - import deepspeed - return deepspeed.get_accelerator().current_device_name() elif torch.backends.mps.is_available(): return torch.device('mps') elif is_torch_xpu_available(): diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py index f1af1299..d57020c6 100644 --- a/modules/transformers_loader.py +++ b/modules/transformers_loader.py @@ -1,4 +1,3 @@ -import os import pprint from pathlib import Path @@ -6,11 +5,7 @@ import torch import torch.nn.functional as F import transformers from accelerate import infer_auto_device_map, init_empty_weights -from accelerate.utils import ( - is_ccl_available, - is_npu_available, - is_xpu_available -) +from accelerate.utils import is_xpu_available from transformers import ( AutoConfig, AutoModel, @@ -28,31 +23,6 @@ from modules.torch_utils import get_device transformers.logging.set_verbosity_error() -local_rank = None -if shared.args.deepspeed: - import deepspeed - from transformers.integrations.deepspeed import ( - HfDeepSpeedConfig, - is_deepspeed_zero3_enabled - ) - - from modules.deepspeed_parameters import generate_ds_config - - # Distributed setup - local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0")) - world_size = int(os.getenv("WORLD_SIZE", "1")) - if is_xpu_available() and is_ccl_available(): - torch.xpu.set_device(local_rank) - deepspeed.init_distributed(backend="ccl") - elif is_npu_available(): - torch.npu.set_device(local_rank) - deepspeed.init_distributed(dist_backend="hccl") - else: - torch.cuda.set_device(local_rank) - deepspeed.init_distributed() - ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir) - dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration - class _StopEverythingStoppingCriteria(transformers.StoppingCriteria): def __init__(self): @@ -163,7 +133,6 @@ def load_model_HF(model_name): shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.disk, - shared.args.deepspeed, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1, @@ -183,25 +152,6 @@ def load_model_HF(model_name): if device: model = model.to(device) - # DeepSpeed ZeRO-3 - elif shared.args.deepspeed: - model = LoaderClass.from_pretrained( - path_to_model, - torch_dtype=params['torch_dtype'], - trust_remote_code=params.get('trust_remote_code') - ) - - model = deepspeed.initialize( - model=model, - config_params=ds_config, - model_parameters=None, - optimizer=None, - lr_scheduler=None - )[0] - - model.module.eval() # Inference - logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}') - # Load with quantization and/or offloading else: if not any((shared.args.cpu, torch.cuda.is_available(), is_xpu_available(), torch.backends.mps.is_available())):