From e98d1086f53dc9c9baa4d17f08b9660244b67d0f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 1 Feb 2024 20:09:30 -0300 Subject: [PATCH 01/18] Bump llama-cpp-python to 0.2.38 (#5420) --- requirements.txt | 24 ++++++++++++------------ requirements_amd.txt | 12 ++++++------ requirements_amd_noavx2.txt | 8 ++++---- requirements_apple_intel.txt | 12 ++++++------ requirements_apple_silicon.txt | 16 ++++++++-------- requirements_cpu_only.txt | 8 ++++---- requirements_cpu_only_noavx2.txt | 8 ++++---- requirements_noavx2.txt | 24 ++++++++++++------------ 8 files changed, 56 insertions(+), 56 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5a41d28f..660116e4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,22 +29,22 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # llama-cpp-python (CUDA, no tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.36+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.36+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.36+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.36+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.38+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.38+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.38+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.38+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # llama-cpp-python (CUDA, tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.36+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.36+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.36+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.36+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.38+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.38+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.38+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.38+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements_amd.txt b/requirements_amd.txt index 7013a9db..23a7da45 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -29,14 +29,14 @@ 
bitsandbytes==0.38.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.38.1-py3-none-win_amd64.whl; platform_system == "Windows" # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.36+rocm5.6.1-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.36+rocm5.6.1-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.38+rocm5.6.1-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.38+rocm5.6.1-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" 
https://github.com/turboderp/exllamav2/releases/download/v0.0.12/exllamav2-0.0.12+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index b2cad012..d5470974 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -29,10 +29,10 @@ bitsandbytes==0.38.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.38.1-py3-none-win_amd64.whl; platform_system == "Windows" # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index a0df8b2a..82c7d5c7 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -29,9 +29,9 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" # Mac wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.36-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.36-cp310-cp310-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and 
python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.36-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.36-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.36-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.36-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.38-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.38-cp310-cp310-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.38-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.38-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.38-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.38-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 75b257d5..806d3c0b 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -29,11 +29,11 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" # Mac wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.36-cp311-cp311-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.36-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" 
and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.36-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.36-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.36-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.36-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.36-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.36-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.38-cp311-cp311-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.38-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.38-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.38-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.38-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.38-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.38-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.38-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index fea31012..b17aa874 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -29,7 +29,7 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index 5d6fe581..bd0ffe59 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -29,7 +29,7 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx-cp311-cp311-win_amd64.whl; platform_system == 
"Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 400cc2ed..fe660972 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -29,22 +29,22 @@ bitsandbytes==0.41.1; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows" # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.36+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # llama-cpp-python (CUDA, no tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.36+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == 
"3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.36+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.36+cu121avx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.36+cu121avx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.38+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.38+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.38+cu121avx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.38+cu121avx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # llama-cpp-python (CUDA, tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.36+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.36+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.36+cu121avx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.36+cu121avx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.38+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.38+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.38+cu121avx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.38+cu121avx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" From b6077b02e45eb904aa6f9eca142d38b9ee6a46e8 Mon Sep 17 00:00:00 2001 From: kalomaze <66376113+kalomaze@users.noreply.github.com> Date: Sat, 3 Feb 2024 21:20:02 -0600 Subject: [PATCH 02/18] Quadratic sampling (#5403) --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> --- docs/03 - Parameters Tab.md | 1 + extensions/openai/typing.py | 1 + modules/loaders.py | 3 +++ modules/presets.py | 1 + modules/sampler_hijack.py | 47 +++++++++++++++++++++++++++---------- modules/text_generation.py | 5 ++-- modules/ui.py | 1 + modules/ui_parameters.py | 1 + 8 files changed, 45 insertions(+), 15 deletions(-) diff --git a/docs/03 - Parameters Tab.md b/docs/03 - Parameters Tab.md index 501cf510..affa9e73 100644 --- a/docs/03 - Parameters Tab.md +++ b/docs/03 - Parameters Tab.md @@ -55,6 +55,7 @@ For more information about the parameters, the [transformers documentation](http * **mirostat_tau**: The target cross-entropy ("surprise") value that Mirostat tries to maintain while generating; see the paper for details. According to the Preset Arena, 8 is a good value. * **mirostat_eta**: The learning rate with which Mirostat adjusts toward that target; see the paper for details. According to the Preset Arena, 0.1 is a good value. * **dynamic_temperature**: Activates Dynamic Temperature. This modifies temperature to range between "dynatemp_low" (minimum) and "dynatemp_high" (maximum), with an entropy-based scaling. The steepness of the curve is controlled by "dynatemp_exponent". +* **smoothing_factor**: Activates Quadratic Sampling. This takes precedence over regular temperature and dynamic temperature, and replaces those samplers. When `0 < smoothing_factor < 1`, the logits distribution becomes flatter. When `smoothing_factor > 1`, it becomes more peaked. * **temperature_last**: Makes temperature the last sampler instead of the first. With this, you can remove low probability tokens with a sampler like min_p and then use a high temperature to make the model creative without losing coherency. * **do_sample**: When unchecked, sampling is entirely disabled, and greedy decoding is used instead (the most likely token is always picked). * **Seed**: Set the PyTorch seed to this number. Note that some loaders do not use PyTorch (notably llama.cpp), and others are not deterministic (notably ExLlama v1 and v2). For these loaders, the seed has no effect.
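For reference, the quadratic transformation that this patch applies in `modules/sampler_hijack.py` can be reproduced standalone. The sketch below is illustrative only (the function name and example logits are invented for the demonstration); it uses the same formula as the patch, squaring each logit's distance from the maximum and scaling it by `smoothing_factor`, so the top logit stays fixed while the rest of the distribution is flattened (factor below 1) or sharpened (factor above 1):

```python
import torch

def quadratic_smoothing(scores: torch.Tensor, smoothing_factor: float) -> torch.Tensor:
    # Same transformation as the patched warper below: the maximum logit is
    # unchanged, and every other logit is pulled down in proportion to the
    # square of its distance from that maximum.
    max_logit = scores.max()
    return -(smoothing_factor * (scores - max_logit) ** 2) + max_logit

logits = torch.tensor([3.0, 2.0, 1.0, 0.0])
print(torch.softmax(logits, dim=-1))                            # baseline: ~[0.64, 0.24, 0.09, 0.03]
print(torch.softmax(quadratic_smoothing(logits, 0.5), dim=-1))  # flatter: top-1 drops to ~0.57
print(torch.softmax(quadratic_smoothing(logits, 3.0), dim=-1))  # peaked: top-1 rises to ~0.95
```

Because the transformed logits are returned directly, regular temperature and dynamic temperature are skipped whenever `smoothing_factor > 0`, matching the precedence described in the documentation entry above.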
diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index 9c4a04f0..3deb464f 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -12,6 +12,7 @@ class GenerationOptions(BaseModel): dynatemp_low: float = 1 dynatemp_high: float = 1 dynatemp_exponent: float = 1 + smoothing_factor: float = 0 top_k: int = 0 repetition_penalty: float = 1 repetition_penalty_range: int = 1024 diff --git a/modules/loaders.py b/modules/loaders.py index 2976a851..618d4502 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -159,6 +159,7 @@ def transformers_samplers(): 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', + 'smoothing_factor', 'top_p', 'min_p', 'top_k', @@ -233,6 +234,7 @@ loaders_samplers = { 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', + 'smoothing_factor', 'top_p', 'min_p', 'top_k', @@ -289,6 +291,7 @@ loaders_samplers = { 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', + 'smoothing_factor', 'top_p', 'min_p', 'top_k', diff --git a/modules/presets.py b/modules/presets.py index 5e686e34..966c706e 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -17,6 +17,7 @@ def default_preset(): 'dynatemp_low': 1, 'dynatemp_high': 1, 'dynatemp_exponent': 1, + 'smoothing_factor': 0, 'top_p': 1, 'min_p': 0, 'top_k': 0, diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py index e9d82d3c..59b90b02 100644 --- a/modules/sampler_hijack.py +++ b/modules/sampler_hijack.py @@ -15,8 +15,12 @@ from modules import shared global_scores = None -class TemperatureLogitsWarperWithDynatemp(LogitsWarper): - def __init__(self, temperature: float, dynamic_temperature: bool, dynatemp_low: float, dynatemp_high: float, dynatemp_exponent: float): +class ModifiedTemperatureLogitsWarper(LogitsWarper): + ''' + Based on the original Transformers temperature logits warper, this + adds support for dynamic temperature and quadratic sampling. 
+ ''' + def __init__(self, temperature: float, dynamic_temperature: bool, dynatemp_low: float, dynatemp_high: float, dynatemp_exponent: float, smoothing_factor: float): if not isinstance(temperature, float) or not (temperature > 0): except_msg = ( f"`temperature` (={temperature}) has to be a strictly positive float, otherwise your next token " @@ -32,16 +36,27 @@ class TemperatureLogitsWarperWithDynatemp(LogitsWarper): self.dynatemp_low = dynatemp_low self.dynatemp_high = dynatemp_high self.dynatemp_exponent = dynatemp_exponent + self.smoothing_factor = smoothing_factor def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - # Regular temperature - if not self.dynamic_temperature: - scores = scores / self.temperature - return scores + # Quadratic sampling + if self.smoothing_factor > 0: + + # Compute the maximum logit value + max_logit = scores.max() + + # Apply the quadratic transformation + transformed_logits = -(self.smoothing_factor * (scores - max_logit)**2) + max_logit + + # No need to print the top 5 logits since this is not required + # print("Original top 5 logits: ", torch.topk(scores, 5)) + # print("New top 5 logits: ", torch.topk(transformed_logits, 5)) + + return transformed_logits # Dynamic temperature - else: + elif self.dynamic_temperature: min_temp = self.dynatemp_low max_temp = self.dynatemp_high exponent_val = self.dynatemp_exponent @@ -88,6 +103,11 @@ class TemperatureLogitsWarperWithDynatemp(LogitsWarper): return scores + # Regular temperature + else: + scores = scores / self.temperature + return scores + class MinPLogitsWarper(LogitsWarper): def __init__(self, min_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): @@ -286,7 +306,7 @@ def get_logits_warper_patch(self, generation_config): generation_config.temperature = float(generation_config.temperature) temperature = generation_config.temperature - if generation_config.dynamic_temperature: + if generation_config.dynamic_temperature or generation_config.smoothing_factor > 0: # Make sure TemperatureLogitsWarper will be created by temporarily # setting temperature to a value != 1. 
generation_config.temperature = 1.1 @@ -294,12 +314,13 @@ def get_logits_warper_patch(self, generation_config): warpers = self._get_logits_warper_old(generation_config) for i in range(len(warpers)): if warpers[i].__class__.__name__ == 'TemperatureLogitsWarper': - warpers[i] = TemperatureLogitsWarperWithDynatemp( + warpers[i] = ModifiedTemperatureLogitsWarper( temperature, generation_config.dynamic_temperature, generation_config.dynatemp_low, generation_config.dynatemp_high, - generation_config.dynatemp_exponent + generation_config.dynatemp_exponent, + generation_config.smoothing_factor ) warpers_to_add = LogitsProcessorList() @@ -328,7 +349,7 @@ def get_logits_warper_patch(self, generation_config): if generation_config.temperature_last: temperature_idx = None for i in range(len(warpers)): - if warpers[i].__class__.__name__ in ['TemperatureLogitsWarper', 'TemperatureLogitsWarperWithDynatemp']: + if warpers[i].__class__.__name__ in ['TemperatureLogitsWarper', 'ModifiedTemperatureLogitsWarper']: temperature_idx = i break @@ -352,8 +373,7 @@ def get_logits_processor_patch(self, **kwargs): repetition_penalty_range = kwargs['generation_config'].repetition_penalty_range do_rep_pen_hijack = (repetition_penalty > 1) or (presence_penalty != 0) or (frequency_penalty != 0) if do_rep_pen_hijack: - # Make sure that a RepetitionPenaltyLogitsProcessor will be created - kwargs['generation_config'].repetition_penalty = 1.1 # must set to some value > 1 + kwargs['generation_config'].repetition_penalty = 1.1 # Set to value > 1 to ensure RepetitionPenaltyLogitsProcessor is created result = self._get_logits_processor_old(**kwargs) @@ -372,6 +392,7 @@ def generation_config_init_patch(self, **kwargs): self.dynatemp_low = kwargs.pop("dynatemp_low", 1) self.dynatemp_high = kwargs.pop("dynatemp_high", 1) self.dynatemp_exponent = kwargs.pop("dynatemp_exponent", 1) + self.smoothing_factor = kwargs.pop("smoothing_factor", 0.0) self.tfs = kwargs.pop("tfs", 1.0) self.top_a = kwargs.pop("top_a", 0.0) self.mirostat_mode = kwargs.pop("mirostat_mode", 0) diff --git a/modules/text_generation.py b/modules/text_generation.py index f4849840..2796bfe1 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -285,8 +285,9 @@ def get_reply_from_output_ids(output_ids, state=None, starting_from=0): def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False): generate_params = {} - for k in ['max_new_tokens', 'temperature', 'temperature_last', 'dynamic_temperature', 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', 'top_p', 'min_p', 'top_k', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'typical_p', 'tfs', 'top_a', 'guidance_scale', 'penalty_alpha', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'do_sample', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'min_length', 'num_beams', 'length_penalty', 'early_stopping']: - generate_params[k] = state[k] + for k in ['max_new_tokens', 'temperature', 'temperature_last', 'dynamic_temperature', 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', 'smoothing_factor', 'top_p', 'min_p', 'top_k', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'typical_p', 'tfs', 'top_a', 'guidance_scale', 'penalty_alpha', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'do_sample', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'min_length', 'num_beams', 'length_penalty', 'early_stopping']: + if k in state: + generate_params[k] = state[k] if 
state['negative_prompt'] != '': generate_params['negative_prompt_ids'] = encode(state['negative_prompt']) diff --git a/modules/ui.py b/modules/ui.py index b639c4df..53a8fd14 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -120,6 +120,7 @@ def list_interface_input_elements(): 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', + 'smoothing_factor', 'top_p', 'min_p', 'top_k', diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 63a3743a..a81ed27a 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -49,6 +49,7 @@ def create_ui(default_preset): shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=generate_params['mirostat_mode'], label='mirostat_mode', info='mode=1 is for llama.cpp only.') shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=generate_params['mirostat_tau'], label='mirostat_tau') shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=generate_params['mirostat_eta'], label='mirostat_eta') + shared.gradio['smoothing_factor'] = gr.Slider(0.0, 10.0, value=generate_params['smoothing_factor'], step=0.01, label='smoothing_factor', info='Replaces temperature with Quadratic Sampling.') shared.gradio['dynamic_temperature'] = gr.Checkbox(value=generate_params['dynamic_temperature'], label='dynamic_temperature') shared.gradio['dynatemp_low'] = gr.Slider(0.01, 5, value=generate_params['dynatemp_low'], step=0.01, label='dynatemp_low', visible=generate_params['dynamic_temperature']) shared.gradio['dynatemp_high'] = gr.Slider(0.01, 5, value=generate_params['dynatemp_high'], step=0.01, label='dynatemp_high', visible=generate_params['dynamic_temperature']) From cde000d47801fa13c5a88f9e435da64132bd96bc Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 Feb 2024 01:15:51 -0300 Subject: [PATCH 03/18] Remove non-HF ExLlamaV2 loader (#5431) --- modules/LoRA.py | 8 +- modules/exllamav2.py | 149 ------------------------------------- modules/loaders.py | 33 -------- modules/logits.py | 12 +-- modules/models.py | 8 -- modules/models_settings.py | 2 + modules/shared.py | 14 ++-- modules/text_generation.py | 11 +-- modules/ui_model_menu.py | 1 - 9 files changed, 18 insertions(+), 220 deletions(-) delete mode 100644 modules/exllamav2.py diff --git a/modules/LoRA.py b/modules/LoRA.py index 2619815c..6be473e0 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -12,7 +12,7 @@ from modules.models import reload_model def add_lora_to_model(lora_names): if 'GPTQForCausalLM' in shared.model.__class__.__name__ or shared.args.loader == 'AutoGPTQ': add_lora_autogptq(lora_names) - elif shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader == ['ExLlamav2', 'ExLlamav2_HF']: + elif shared.model.__class__.__name__ in ['Exllamav2HF'] or shared.args.loader == ['ExLlamav2_HF']: add_lora_exllamav2(lora_names) else: add_lora_transformers(lora_names) @@ -39,11 +39,7 @@ def add_lora_exllamav2(lora_names): shared.model.loras = [] for lora_name in lora_names: lora_path = get_lora_path(lora_name) - if shared.model.__class__.__name__ == 'Exllamav2Model': - lora = ExLlamaV2Lora.from_directory(shared.model.model, str(lora_path)) - else: - lora = ExLlamaV2Lora.from_directory(shared.model.ex_model, str(lora_path)) - + lora = ExLlamaV2Lora.from_directory(shared.model.ex_model, str(lora_path)) shared.model.loras.append(lora) shared.lora_names = lora_names diff --git a/modules/exllamav2.py b/modules/exllamav2.py deleted file mode 100644 index 551ed498..00000000 --- 
a/modules/exllamav2.py +++ /dev/null @@ -1,149 +0,0 @@ -import traceback -from pathlib import Path - -import torch -from exllamav2 import ( - ExLlamaV2, - ExLlamaV2Cache, - ExLlamaV2Cache_8bit, - ExLlamaV2Config, - ExLlamaV2Tokenizer -) -from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator - -from modules import shared -from modules.logging_colors import logger -from modules.text_generation import get_max_prompt_length - -try: - import flash_attn -except ModuleNotFoundError: - logger.warning( - 'You are running ExLlamaV2 without flash-attention. This will cause the VRAM usage ' - 'to be a lot higher than it could be.\n' - 'Try installing flash-attention following the instructions here: ' - 'https://github.com/Dao-AILab/flash-attention#installation-and-features' - ) - pass -except Exception: - logger.warning('Failed to load flash-attention due to the following error:\n') - traceback.print_exc() - - -class Exllamav2Model: - def __init__(self): - pass - - @classmethod - def from_pretrained(self, path_to_model): - - path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model) - - config = ExLlamaV2Config() - config.model_dir = str(path_to_model) - config.prepare() - - config.max_seq_len = shared.args.max_seq_len - config.scale_pos_emb = shared.args.compress_pos_emb - config.scale_alpha_value = shared.args.alpha_value - config.no_flash_attn = shared.args.no_flash_attn - config.num_experts_per_token = int(shared.args.num_experts_per_token) - - model = ExLlamaV2(config) - - split = None - if shared.args.gpu_split: - split = [float(alloc) for alloc in shared.args.gpu_split.split(",")] - - model.load(split) - - tokenizer = ExLlamaV2Tokenizer(config) - if shared.args.cache_8bit: - cache = ExLlamaV2Cache_8bit(model) - else: - cache = ExLlamaV2Cache(model) - - generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer) - - result = self() - result.model = model - result.cache = cache - result.tokenizer = tokenizer - result.generator = generator - result.loras = None - return result, result - - def encode(self, string, **kwargs): - return self.tokenizer.encode(string, add_bos=True, encode_special_tokens=True) - - def decode(self, ids, **kwargs): - if isinstance(ids, list): - ids = torch.tensor([ids]) - elif isinstance(ids, torch.Tensor) and ids.numel() == 1: - ids = ids.view(1, -1) - - return self.tokenizer.decode(ids, decode_special_tokens=True)[0] - - def get_logits(self, token_ids, **kwargs): - self.cache.current_seq_len = 0 - if token_ids.shape[-1] > 1: - self.model.forward(token_ids[:, :-1], self.cache, input_mask=None, preprocess_only=True, loras=self.loras) - - return self.model.forward(token_ids[:, -1:], self.cache, input_mask=None, loras=self.loras, **kwargs).float().cpu() - - def generate_with_streaming(self, prompt, state): - settings = ExLlamaV2Sampler.Settings() - - settings.token_repetition_penalty = state['repetition_penalty'] - settings.token_repetition_range = -1 if state['repetition_penalty_range'] <= 0 else state['repetition_penalty_range'] - - settings.token_frequency_penalty = state['frequency_penalty'] - settings.token_presence_penalty = state['presence_penalty'] - - settings.temperature = state['temperature'] - settings.top_k = state['top_k'] - settings.top_p = state['top_p'] - settings.top_a = state['top_a'] - settings.min_p = state['min_p'] - settings.tfs = state['tfs'] - settings.typical = state['typical_p'] - - settings.temperature_last = state['temperature_last'] - - settings.mirostat = state['mirostat_mode'] == 2 - 
settings.mirostat_tau = state['mirostat_tau'] - settings.mirostat_eta = state['mirostat_eta'] - - if state['ban_eos_token']: - settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id]) - - if state['custom_token_bans']: - to_ban = [int(x) for x in state['custom_token_bans'].split(',')] - if len(to_ban) > 0: - settings.disallow_tokens(self.tokenizer, to_ban) - - ids = self.tokenizer.encode(prompt, add_bos=state['add_bos_token'], encode_special_tokens=True) - ids = ids[:, -get_max_prompt_length(state):] - - if state['auto_max_new_tokens']: - max_new_tokens = state['truncation_length'] - ids.shape[-1] - else: - max_new_tokens = state['max_new_tokens'] - - self.generator.begin_stream(ids, settings, loras=self.loras) - - decoded_text = '' - for i in range(max_new_tokens): - chunk, eos, _ = self.generator.stream() - if eos or shared.stop_everything: - break - - decoded_text += chunk - yield decoded_text - - def generate(self, prompt, state): - output = '' - for output in self.generate_with_streaming(prompt, state): - pass - - return output diff --git a/modules/loaders.py b/modules/loaders.py index 618d4502..5b39c379 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -81,16 +81,6 @@ loaders_and_params = OrderedDict({ 'trust_remote_code', 'no_use_fast', ], - 'ExLlamav2': [ - 'gpu_split', - 'max_seq_len', - 'no_flash_attn', - 'num_experts_per_token', - 'cache_8bit', - 'alpha_value', - 'compress_pos_emb', - 'exllamav2_info', - ], 'AutoGPTQ': [ 'triton', 'no_inject_fused_attention', @@ -204,29 +194,6 @@ loaders_samplers = { 'AutoAWQ': transformers_samplers(), 'QuIP#': transformers_samplers(), 'HQQ': transformers_samplers(), - 'ExLlamav2': { - 'temperature', - 'temperature_last', - 'top_p', - 'min_p', - 'top_k', - 'typical_p', - 'tfs', - 'top_a', - 'repetition_penalty', - 'presence_penalty', - 'frequency_penalty', - 'repetition_penalty_range', - 'seed', - 'mirostat_mode', - 'mirostat_tau', - 'mirostat_eta', - 'ban_eos_token', - 'add_bos_token', - 'custom_token_bans', - 'skip_special_tokens', - 'auto_max_new_tokens', - }, 'ExLlamav2_HF': { 'temperature', 'temperature_last', diff --git a/modules/logits.py b/modules/logits.py index c630be88..c2cbd92e 100644 --- a/modules/logits.py +++ b/modules/logits.py @@ -13,11 +13,10 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return logger.error("No model is loaded! 
Select one in the Model tab.") return 'Error: No model is loaded! Select one in the Model tab.', previous - is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model' is_non_hf_llamacpp = shared.model.__class__.__name__ == 'LlamaCppModel' if use_samplers: - if any([is_non_hf_exllamav2, is_non_hf_llamacpp]): + if is_non_hf_llamacpp: logger.error("Sampler hijacking is not supported with non-Huggingface loaders.") # sampling is all done in C for exllama, so it is really hard to hijack # it should be possible to hijack llamacpp sampler by hijacking all their sampling methods, @@ -31,13 +30,7 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return scores = sampler_hijack.global_scores[-1] else: - if is_non_hf_exllamav2: - if is_torch_xpu_available(): - tokens = shared.tokenizer.encode(prompt).to("xpu:0") - else: - tokens = shared.tokenizer.encode(prompt).cuda() - scores = shared.model.get_logits(tokens)[-1][-1] - elif is_non_hf_llamacpp: + if is_non_hf_llamacpp: tokens = shared.tokenizer.encode(prompt) scores = shared.model.get_logits(tokens)[-1][-1] else: @@ -45,6 +38,7 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("xpu:0") else: tokens = shared.tokenizer.encode(prompt, return_tensors='pt').cuda() + output = shared.model(input_ids=tokens) scores = output['logits'][-1][-1] diff --git a/modules/models.py b/modules/models.py index 6c38c3c7..ab0e762c 100644 --- a/modules/models.py +++ b/modules/models.py @@ -65,7 +65,6 @@ def load_model(model_name, loader=None): 'GPTQ-for-LLaMa': GPTQ_loader, 'llama.cpp': llamacpp_loader, 'llamacpp_HF': llamacpp_HF_loader, - 'ExLlamav2': ExLlamav2_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, 'ctransformers': ctransformers_loader, 'AutoAWQ': AutoAWQ_loader, @@ -376,13 +375,6 @@ def AutoGPTQ_loader(model_name): return modules.AutoGPTQ_loader.load_quantized(model_name) -def ExLlamav2_loader(model_name): - from modules.exllamav2 import Exllamav2Model - - model, tokenizer = Exllamav2Model.from_pretrained(model_name) - return model, tokenizer - - def ExLlamav2_HF_loader(model_name): - from modules.exllamav2_hf import Exllamav2HF diff --git a/modules/models_settings.py b/modules/models_settings.py index 9acc7efa..3e1649aa 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -141,6 +141,8 @@ def get_model_metadata(model): if re.match(pat.lower(), model.lower()): for k in settings[pat]: model_settings[k] = settings[pat][k] + if k == 'loader' and settings[pat][k] == 'ExLlamav2': + model_settings[k] = 'ExLlamav2_HF' return model_settings diff --git a/modules/shared.py b/modules/shared.py index e4bdacaa..cc8b9e5d 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -88,7 +88,7 @@ group.add_argument('--chat-buttons', action='store_true', help='Show buttons on # Model loader group = parser.add_argument_group('Model loader') -group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected.
Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') @@ -130,11 +130,11 @@ group.add_argument('--logits_all', action='store_true', help='Needs to be set fo group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.') -# ExLlama -group = parser.add_argument_group('ExLlama') +# ExLlamaV2 +group = parser.add_argument_group('ExLlamaV2') group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.') group.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequence length.') -group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.') +group.add_argument('--cfg-cache', action='store_true', help='Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.') group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.') group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.') group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.') @@ -248,11 +248,7 @@ def fix_loader_name(name): return 'AutoGPTQ' elif name in ['gptq-for-llama', 'gptqforllama', 'gptqllama', 'gptq for llama', 'gptq_for_llama']: return 'GPTQ-for-LLaMa' - elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']: - return 'ExLlama' - elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']: - return 'ExLlamav2' - elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']: + elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2', 'exllama', 'ex-llama', 'ex_llama', 'exlama', 'exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']: return 'ExLlamav2_HF' elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']: return 'ctransformers' diff --git a/modules/text_generation.py b/modules/text_generation.py index 2796bfe1..198b7575 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -45,7 +45,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap yield '' return - if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel']: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel']: generate_func = generate_reply_custom else: generate_func = generate_reply_HF @@ -120,10 +120,11 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if shared.tokenizer is None: raise ValueError('No tokenizer is loaded') - if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel', 'Exllamav2Model']: + if shared.model.__class__.__name__ in ['LlamaCppModel', 
'CtransformersModel']: input_ids = shared.tokenizer.encode(str(prompt)) - if shared.model.__class__.__name__ not in ['Exllamav2Model']: - input_ids = np.array(input_ids).reshape(1, len(input_ids)) + # The step below is necessary for llama.cpp, but may not be + # necessary for future loaders. + input_ids = np.array(input_ids).reshape(1, len(input_ids)) else: input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens) if not add_bos_token: @@ -134,7 +135,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if truncation_length is not None: input_ids = input_ids[:, -truncation_length:] - if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel'] or shared.args.cpu: return input_ids elif shared.args.deepspeed: return input_ids.to(device=local_rank) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 441092a3..ebed4aa0 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -135,7 +135,6 @@ def create_ui(): shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.') - shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer.\n\nOption 1 (recommended): place your .gguf in a subfolder of models/ along with these 4 files: special_tokens_map.json, tokenizer_config.json, tokenizer.json, tokenizer.model.\n\nOption 2: download `oobabooga/llama-tokenizer` under "Download model or LoRA". 
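The reshape kept above is easy to see in isolation: llama-cpp-python's tokenizer returns a flat list of token ids, while the rest of the pipeline expects a (batch, sequence) array. A minimal illustration with made-up token ids:

```python
import numpy as np

input_ids = [1, 15043, 3186]  # example ids, as returned by a llama.cpp-style tokenizer
batched = np.array(input_ids).reshape(1, len(input_ids))
print(batched.shape)  # (1, 3)
```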
That\'s a default Llama tokenizer that will work for some (but not all) models.') with gr.Column(): From 4e188eeb80a1ca537523d46958f3f1deef48b2d2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 3 Feb 2024 20:39:50 -0800 Subject: [PATCH 04/18] Lint --- modules/LoRA.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/LoRA.py b/modules/LoRA.py index 6be473e0..0cb1671e 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -12,7 +12,7 @@ from modules.models import reload_model def add_lora_to_model(lora_names): if 'GPTQForCausalLM' in shared.model.__class__.__name__ or shared.args.loader == 'AutoGPTQ': add_lora_autogptq(lora_names) - elif shared.model.__class__.__name__ in ['Exllamav2HF'] or shared.args.loader == ['ExLlamav2_HF']: + elif shared.model.__class__.__name__ == 'Exllamav2HF' or shared.args.loader == 'ExLlamav2_HF': add_lora_exllamav2(lora_names) else: add_lora_transformers(lora_names) From 3df7e151f7c3a7e02a2831fe1fcd7cd15881bc91 Mon Sep 17 00:00:00 2001 From: Badis Ghoubali <110173477+BadisG@users.noreply.github.com> Date: Sun, 4 Feb 2024 22:15:30 +0100 Subject: [PATCH 05/18] fix the n_batch slider (#5436) --- modules/ui_model_menu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index ebed4aa0..12da92d4 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -90,7 +90,7 @@ def create_ui(): shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=shared.settings['truncation_length_max'], step=256, label="n_ctx", value=shared.args.n_ctx, info='Context length. Try lowering this if you run out of memory while loading the model.') shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads) shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=32, value=shared.args.threads_batch) - shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch) + shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch) shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None") shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None") From 2a45620c851e6b4244697b1901de523392b7b6a5 Mon Sep 17 00:00:00 2001 From: Forkoz <59298527+Ph0rk0z@users.noreply.github.com> Date: Mon, 5 Feb 2024 02:36:40 +0000 Subject: [PATCH 06/18] Split by rows instead of layers for llama.cpp multi-gpu (#5435) --- modules/llamacpp_hf.py | 3 ++- modules/llamacpp_model.py | 3 ++- modules/loaders.py | 2 ++ modules/shared.py | 1 + modules/ui.py | 1 + modules/ui_model_menu.py | 1 + 6 files changed, 9 insertions(+), 2 deletions(-) diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py index d491c463..4726669b 100644 --- a/modules/llamacpp_hf.py +++ b/modules/llamacpp_hf.py @@ -216,7 +216,8 @@ class LlamacppHF(PreTrainedModel): 'tensor_split': tensor_split_list, 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb, 'logits_all': shared.args.logits_all, - 'offload_kqv': not shared.args.no_offload_kqv + 'offload_kqv': not shared.args.no_offload_kqv, + 'split_mode': 1 if not shared.args.row_split else 2 } Llama = llama_cpp_lib().Llama diff --git a/modules/llamacpp_model.py 
b/modules/llamacpp_model.py index 96ea98e9..7c405a4b 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -95,7 +95,8 @@ class LlamaCppModel: 'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base), 'tensor_split': tensor_split_list, 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb, - 'offload_kqv': not shared.args.no_offload_kqv + 'offload_kqv': not shared.args.no_offload_kqv, + 'split_mode': 1 if not shared.args.row_split else 2 } result.model = Llama(**params) diff --git a/modules/loaders.py b/modules/loaders.py index 5b39c379..a0104e90 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -44,6 +44,7 @@ loaders_and_params = OrderedDict({ 'cpu', 'numa', 'no_offload_kqv', + 'row_split', 'tensorcores', ], 'llamacpp_HF': [ @@ -66,6 +67,7 @@ loaders_and_params = OrderedDict({ 'no_use_fast', 'logits_all', 'no_offload_kqv', + 'row_split', 'tensorcores', 'llamacpp_HF_info', ], diff --git a/modules/shared.py b/modules/shared.py index cc8b9e5d..38d08349 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -129,6 +129,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc group.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.') group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.') +group.add_argument('--row_split', action='store_true', help='Split multi-gpu by row instead of layer. Faster on some cards.') # ExLlamaV2 group = parser.add_argument_group('ExLlamaV2') diff --git a/modules/ui.py b/modules/ui.py index 53a8fd14..acd959a0 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -93,6 +93,7 @@ def list_model_elements(): 'numa', 'logits_all', 'no_offload_kqv', + 'row_split', 'tensorcores', 'hqq_backend', ] diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 12da92d4..f03d45c9 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -107,6 +107,7 @@ def create_ui(): with gr.Column(): shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.') shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') + shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split model by rows across GPUs. Improves performance on some cards.') shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton) shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.') shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. 
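For reference, the new 'split_mode' value chooses between llama.cpp's two multi-GPU split strategies. A sketch of the mapping; the constant names and values are assumptions based on llama.cpp's C API around the time of this patch, not a stable interface:

```python
LLAMA_SPLIT_LAYER = 1  # assumed: distribute whole layers across GPUs (the default)
LLAMA_SPLIT_ROW = 2    # assumed: split individual tensors by rows

def split_mode_from_args(row_split: bool) -> int:
    # Mirrors the expression added above: 1 if not row_split else 2.
    return LLAMA_SPLIT_ROW if row_split else LLAMA_SPLIT_LAYER

print(split_mode_from_args(False), split_mode_from_args(True))  # 1 2
```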
Disable if running low on VRAM.') From 9fdee65cf50923be9cf36b50bc6a7140c7f2f2d0 Mon Sep 17 00:00:00 2001 From: Badis Ghoubali <110173477+BadisG@users.noreply.github.com> Date: Mon, 5 Feb 2024 03:39:15 +0100 Subject: [PATCH 07/18] Improve ChatML template (#5411) --- instruction-templates/ChatML.yaml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/instruction-templates/ChatML.yaml b/instruction-templates/ChatML.yaml index e9f2883f..8b55f0dc 100644 --- a/instruction-templates/ChatML.yaml +++ b/instruction-templates/ChatML.yaml @@ -5,15 +5,12 @@ instruction_template: |- {%- set ns.found = true -%} {%- endif -%} {%- endfor -%} - {%- if not ns.found -%} - {{- '<|im_start|>system\n' + '' + '<|im_end|>\n' -}} - {%- endif %} {%- for message in messages %} {%- if message['role'] == 'system' -%} - {{- '<|im_start|>system\n' + message['content'] + '<|im_end|>\n' -}} + {{- '<|im_start|>system\n' + message['content'].rstrip() + '<|im_end|>\n' -}} {%- else -%} {%- if message['role'] == 'user' -%} - {{-'<|im_start|>user\n' + message['content'] + '<|im_end|>\n'-}} + {{-'<|im_start|>user\n' + message['content'].rstrip() + '<|im_end|>\n'-}} {%- else -%} {{-'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' -}} {%- endif -%} From a210999255d66db0763627d21ee0e6f3223a9ce7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 Feb 2024 18:40:25 -0800 Subject: [PATCH 08/18] Bump safetensors version --- requirements.txt | 2 +- requirements_amd.txt | 2 +- requirements_amd_noavx2.txt | 2 +- requirements_apple_intel.txt | 2 +- requirements_apple_silicon.txt | 2 +- requirements_cpu_only.txt | 2 +- requirements_cpu_only_noavx2.txt | 2 +- requirements_noavx2.txt | 2 +- requirements_nowheels.txt | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index 660116e4..e91ffda6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,7 +16,7 @@ Pillow>=9.5.0 pyyaml requests rich -safetensors==0.4.1 +safetensors==0.4.* scipy sentencepiece tensorboard diff --git a/requirements_amd.txt b/requirements_amd.txt index 23a7da45..481b3150 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -16,7 +16,7 @@ Pillow>=9.5.0 pyyaml requests rich -safetensors==0.4.1 +safetensors==0.4.* scipy sentencepiece tensorboard diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index d5470974..09ca140e 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -16,7 +16,7 @@ Pillow>=9.5.0 pyyaml requests rich -safetensors==0.4.1 +safetensors==0.4.* scipy sentencepiece tensorboard diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index 82c7d5c7..f360cc0e 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -16,7 +16,7 @@ Pillow>=9.5.0 pyyaml requests rich -safetensors==0.4.1 +safetensors==0.4.* scipy sentencepiece tensorboard diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 806d3c0b..81ab0575 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -16,7 +16,7 @@ Pillow>=9.5.0 pyyaml requests rich -safetensors==0.4.1 +safetensors==0.4.* scipy sentencepiece tensorboard diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index b17aa874..58647cd8 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -16,7 +16,7 @@ Pillow>=9.5.0 pyyaml requests rich -safetensors==0.4.1 +safetensors==0.4.* scipy sentencepiece tensorboard diff 
--git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index bd0ffe59..29c5a564 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -16,7 +16,7 @@ Pillow>=9.5.0 pyyaml requests rich -safetensors==0.4.1 +safetensors==0.4.* scipy sentencepiece tensorboard diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index fe660972..fc2795cb 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -16,7 +16,7 @@ Pillow>=9.5.0 pyyaml requests rich -safetensors==0.4.1 +safetensors==0.4.* scipy sentencepiece tensorboard diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index 218f3ddc..e5496c84 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -16,7 +16,7 @@ Pillow>=9.5.0 pyyaml requests rich -safetensors==0.4.1 +safetensors==0.4.* scipy sentencepiece tensorboard From cd4ffd3dd497a79fabd776b937ba056c2980d295 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 Feb 2024 18:48:04 -0800 Subject: [PATCH 09/18] Update docs --- docs/04 - Model Tab.md | 10 +++------- docs/What Works.md | 15 ++++++++------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/docs/04 - Model Tab.md b/docs/04 - Model Tab.md index 2f1e07e6..2585f544 100644 --- a/docs/04 - Model Tab.md +++ b/docs/04 - Model Tab.md @@ -42,22 +42,18 @@ Examples: * https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ * **gpu-split**: If you have multiple GPUs, the amount of memory to allocate per GPU should be set in this field. Make sure to set a lower value for the first GPU, as that's where the cache is allocated. -* **max_seq_len**: The maximum sequence length for the model. In ExLlama, the cache is preallocated, so the higher this value, the higher the VRAM. It is automatically set to the maximum sequence length for the model based on its metadata, but you may need to lower this value be able to fit the model into your GPU. After loading the model, the "Truncate the prompt up to this length" parameter under "Parameters" > "Generation" is automatically set to your chosen "max_seq_len" so that you don't have to set the same thing twice. +* **max_seq_len**: The maximum sequence length for the model. In ExLlamaV2, the cache is preallocated, so the higher this value, the higher the VRAM. It is automatically set to the maximum sequence length for the model based on its metadata, but you may need to lower this value to be able to fit the model into your GPU. After loading the model, the "Truncate the prompt up to this length" parameter under "Parameters" > "Generation" is automatically set to your chosen "max_seq_len" so that you don't have to set the same thing twice. * **cfg-cache**: Creates a second cache to hold the CFG negative prompts. You need to set this if and only if you intend to use CFG in the "Parameters" > "Generation" tab. Checking this parameter doubles the cache VRAM usage. * **no_flash_attn**: Disables flash attention. Otherwise, it is automatically used as long as the library is installed. * **cache_8bit**: Create an 8-bit precision cache instead of a 16-bit one. This saves VRAM but increases perplexity (I don't know by how much). -### ExLlamav2 - -The same as ExLlamav2_HF but using the internal samplers of ExLlamav2 instead of the ones in the Transformers library. - ### AutoGPTQ Loads: GPTQ models. * **wbits**: For ancient models without proper metadata, sets the model precision in bits manually. Can usually be ignored. 
* **groupsize**: For ancient models without proper metadata, sets the model group size manually. Can usually be ignored.
-* **triton**: Only available on Linux. Necessary to use models with both act-order and groupsize simultaneously. Note that ExLlama can load these same models on Windows without triton.
+* **triton**: Only available on Linux. Necessary to use models with both act-order and groupsize simultaneously. Note that ExLlamaV2 can load these same models on Windows without triton.
* **no_inject_fused_attention**: Improves performance while increasing the VRAM usage.
* **no_inject_fused_mlp**: Similar to the previous parameter but for Triton only.
* **no_use_cuda_fp16**: On some systems, the performance can be very bad with this unset. Can usually be ignored.
@@ -67,7 +63,7 @@ Loads: GPTQ models.
Loads: GPTQ models.
-Ancient loader, the first one to implement 4-bit quantization. It works on older GPUs for which ExLlama and AutoGPTQ do not work, and it doesn't work with "act-order", so you should use it with simple 4-bit-128g models.
+Ancient loader, the first one to implement 4-bit quantization. It works on older GPUs for which ExLlamaV2 and AutoGPTQ do not work, and it doesn't work with "act-order", so you should use it with simple 4-bit-128g models.
* **pre_layer**: Used for CPU offloading. The higher the number, the more layers will be sent to the GPU. GPTQ-for-LLaMa CPU offloading was faster than the one implemented in AutoGPTQ the last time I checked.
diff --git a/docs/What Works.md b/docs/What Works.md
index 4f5defab..343343a1 100644
--- a/docs/What Works.md
+++ b/docs/What Works.md
@@ -2,15 +2,16 @@
| Loader | Loading 1 LoRA | Loading 2 or more LoRAs | Training LoRAs | Multimodal extension | Perplexity evaluation |
|----------------|----------------|-------------------------|----------------|----------------------|-----------------------|
-| Transformers | ✅ | ✅*** | ✅* | ✅ | ✅ |
-| ExLlamav2_HF | ✅ | ✅ | ❌ | ❌ | ✅ |
-| ExLlamav2 | ✅ | ✅ | ❌ | ❌ | use ExLlamav2_HF |
-| AutoGPTQ | ✅ | ❌ | ❌ | ✅ | ✅ |
-| GPTQ-for-LLaMa | ✅** | ✅*** | ✅ | ✅ | ✅ |
-| llama.cpp | ❌ | ❌ | ❌ | ❌ | use llamacpp_HF |
+| Transformers | ✅ | ✅\*\*\* | ✅\* | ✅ | ✅ |
+| llama.cpp | ❌ | ❌ | ❌ | ❌ | use llamacpp_HF |
| llamacpp_HF | ❌ | ❌ | ❌ | ❌ | ✅ |
+| ExLlamav2_HF | ✅ | ✅ | ❌ | ❌ | ✅ |
+| AutoGPTQ | ✅ | ❌ | ❌ | ✅ | ✅ |
+| AutoAWQ | ? | ❌ | ? | ? | ✅ |
+| GPTQ-for-LLaMa | ✅\*\* | ✅\*\*\* | ✅ | ✅ | ✅ |
| ctransformers | ❌ | ❌ | ❌ | ❌ | ❌ |
+| QuIP# | ? | ? | ? | ? | ✅ |
+| HQQ | ? | ? | ? | ? 
| ✅ | ❌ = not implemented From 9033fa5eeef7132b7dd1470b5c450b90483499d1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 Feb 2024 19:13:34 -0800 Subject: [PATCH 10/18] Organize the Model tab --- modules/loaders.py | 2 +- modules/ui_model_menu.py | 68 ++++++++++++++++++++++------------------ 2 files changed, 38 insertions(+), 32 deletions(-) diff --git a/modules/loaders.py b/modules/loaders.py index a0104e90..6107dac7 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -26,7 +26,7 @@ loaders_and_params = OrderedDict({ 'compress_pos_emb', 'disable_exllama', 'disable_exllamav2', - 'transformers_info' + 'transformers_info', ], 'llama.cpp': [ 'n_ctx', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index f03d45c9..98868f2c 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -77,63 +77,69 @@ def create_ui(): with gr.Box(): with gr.Row(): with gr.Column(): - for i in range(len(total_mem)): - shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i]) + with gr.Blocks(): + for i in range(len(total_mem)): + shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i]) + + shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem) + + with gr.Blocks(): + shared.gradio['transformers_info'] = gr.Markdown('load-in-4bit params:') + shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype) + shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type) - shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem) - shared.gradio['transformers_info'] = gr.Markdown('load-in-4bit params:') - shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype) - shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type) shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) - shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers) shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=shared.settings['truncation_length_max'], step=256, label="n_ctx", value=shared.args.n_ctx, info='Context length. Try lowering this if you run out of memory while loading the model.') + shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. 
Example: 18,17') + shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch) shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads) shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=32, value=shared.args.threads_batch) - shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch) - shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None") shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None") shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None"], value=shared.args.model_type or "None") shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0) - shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=shared.settings['truncation_length_max'], step=256, info='Context length. Try lowering this if you run out of memory while loading the model.', value=shared.args.max_seq_len) - shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.05, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) - shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=1000000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base) - shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb) + with gr.Blocks(): + shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.05, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) + shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=1000000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base) + shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). 
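The relationship quoted in the rope_freq_base slider above can be checked directly. A quick sketch of the conversion in both directions; the inverse is just the documented formula solved for alpha_value:

```python
def rope_freq_base_from_alpha(alpha_value: float) -> float:
    # As stated in the UI: rope_freq_base = 10000 * alpha_value ^ (64 / 63)
    return 10000 * alpha_value ** (64 / 63)

def alpha_from_rope_freq_base(rope_freq_base: float) -> float:
    return (rope_freq_base / 10000) ** (63 / 64)

print(round(rope_freq_base_from_alpha(2.5)))  # ≈ 25366, the base for the ~2x-context alpha
```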
Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb) + + shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# has to be installed manually at the moment.') with gr.Column(): - shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.') + shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) + shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) + shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant) + shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') + shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices) + shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards.') + shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') + shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') - shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split model by rows across GPUs. Improves performance on some cards.') + shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.') shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton) shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.') shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. 
Disable if running low on VRAM.') shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') - shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.') shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') - shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu) - shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) - shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) - shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices) shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) - shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) - shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant) - shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17') - shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.', interactive=shared.args.trust_remote_code) - shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.') - shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work. 
Otherwise, ignore it, as it makes prompt processing slower.') - shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') - shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.') - shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel.') - shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.') + shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.') - shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') + shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.') + shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') + with gr.Blocks(): + shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) + shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') + shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work with this loader. Otherwise, ignore it, as it makes prompt processing slower.') + + shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel for GPTQ models.') + shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.') shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.') shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer.\n\nOption 1 (recommended): place your .gguf in a subfolder of models/ along with these 4 files: special_tokens_map.json, tokenizer_config.json, tokenizer.json, tokenizer.model.\n\nOption 2: download `oobabooga/llama-tokenizer` under "Download model or LoRA". 
That\'s a default Llama tokenizer that will work for some (but not all) models.') From 7073665a10194015c0529eca790199d41fcea9f9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 Feb 2024 02:31:24 -0300 Subject: [PATCH 11/18] Truncate long chat completions inputs (#5439) --- modules/chat.py | 47 +++++++++++++++++++++++++++++++++----- modules/text_generation.py | 13 +++++++---- 2 files changed, 50 insertions(+), 10 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 5380f1ac..348c5eba 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -166,18 +166,53 @@ def generate_chat_prompt(user_input, state, **kwargs): prompt = remove_extra_bos(prompt) return prompt - prompt = make_prompt(messages) - # Handle truncation max_length = get_max_prompt_length(state) - while len(messages) > 0 and get_encoded_length(prompt) > max_length: - # Try to save the system message - if len(messages) > 1 and messages[0]['role'] == 'system': + prompt = make_prompt(messages) + encoded_length = get_encoded_length(prompt) + + while len(messages) > 0 and encoded_length > max_length: + + # Remove old message, save system message + if len(messages) > 2 and messages[0]['role'] == 'system': messages.pop(1) - else: + + # Remove old message when no system message is present + elif len(messages) > 1 and messages[0]['role'] != 'system': messages.pop(0) + # Resort to truncating the user input + else: + + user_message = messages[-1]['content'] + + # Bisect the truncation point + left, right = 0, len(user_message) - 1 + + while right - left > 1: + mid = (left + right) // 2 + + messages[-1]['content'] = user_message[mid:] + prompt = make_prompt(messages) + encoded_length = get_encoded_length(prompt) + + if encoded_length <= max_length: + right = mid + else: + left = mid + + messages[-1]['content'] = user_message[right:] + prompt = make_prompt(messages) + encoded_length = get_encoded_length(prompt) + if encoded_length > max_length: + logger.error(f"Failed to build the chat prompt. The input is too long for the available context length.\n\nTruncation length: {state['truncation_length']}\nmax_new_tokens: {state['max_new_tokens']} (is it too high?)\nAvailable context length: {max_length}\n") + raise ValueError + else: + logger.warning(f"The input has been truncated. 
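The truncation fallback above bisects the last user message to find the shortest suffix that still fits the context. The same search, extracted into a standalone helper; the one-character-per-token encoder in the usage line is only for the demo:

```python
def truncate_to_fit(user_message, encoded_len, max_length):
    # Binary-search the cut point: shrink the kept suffix until its encoding fits.
    left, right = 0, len(user_message) - 1
    while right - left > 1:
        mid = (left + right) // 2
        if encoded_len(user_message[mid:]) <= max_length:
            right = mid
        else:
            left = mid
    return user_message[right:]

print(truncate_to_fit('abcdefghij', len, 4))  # 'ghij'
```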
Context length: {state['truncation_length']}, max_new_tokens: {state['max_new_tokens']}.") + break + prompt = make_prompt(messages) + encoded_length = get_encoded_length(prompt) if also_return_rows: return prompt, [message['content'] for message in messages] diff --git a/modules/text_generation.py b/modules/text_generation.py index 198b7575..04625ab9 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -50,6 +50,11 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap else: generate_func = generate_reply_HF + if generate_func != generate_reply_HF and shared.args.verbose: + logger.info("PROMPT=") + print(question) + print() + # Prepare the input original_question = question if not is_chat: @@ -65,10 +70,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap if type(st) is list and len(st) > 0: all_stop_strings += st - if shared.args.verbose: - logger.info("PROMPT=") - print(question) - shared.stop_everything = False clear_torch_cache() seed = set_manual_seed(state['seed']) @@ -355,6 +356,10 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(filtered_params) print() + logger.info("PROMPT=") + print(decode(input_ids[0], skip_special_tokens=False)) + print() + t0 = time.time() try: if not is_chat and not shared.is_seq2seq: From f234fbe83fee4fdc025365b3b8215329032febfc Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 Feb 2024 21:44:36 -0800 Subject: [PATCH 12/18] Improve a log message after previous commit --- modules/chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 348c5eba..bddc3132 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -208,7 +208,7 @@ def generate_chat_prompt(user_input, state, **kwargs): logger.error(f"Failed to build the chat prompt. The input is too long for the available context length.\n\nTruncation length: {state['truncation_length']}\nmax_new_tokens: {state['max_new_tokens']} (is it too high?)\nAvailable context length: {max_length}\n") raise ValueError else: - logger.warning(f"The input has been truncated. Context length: {state['truncation_length']}, max_new_tokens: {state['max_new_tokens']}.") + logger.warning(f"The input has been truncated. 
Context length: {state['truncation_length']}, max_new_tokens: {state['max_new_tokens']}, available context length: {max_length}.") break prompt = make_prompt(messages) From 7301c7618feb13936b4a2e70bc8891f786bffa64 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 Feb 2024 21:49:58 -0800 Subject: [PATCH 13/18] Minor change to Models tab --- modules/ui_model_menu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 98868f2c..87f15c1d 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -133,6 +133,7 @@ def create_ui(): shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.') shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.') shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') + shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') with gr.Blocks(): shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') @@ -140,7 +141,6 @@ def create_ui(): shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel for GPTQ models.') shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.') - shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.') shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer.\n\nOption 1 (recommended): place your .gguf in a subfolder of models/ along with these 4 files: special_tokens_map.json, tokenizer_config.json, tokenizer.json, tokenizer.model.\n\nOption 2: download `oobabooga/llama-tokenizer` under "Download model or LoRA". 
That\'s a default Llama tokenizer that will work for some (but not all) models.') From 8c35fefb3b72b06b13a5ad632873cdc7b0013992 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 Feb 2024 11:20:10 -0300 Subject: [PATCH 14/18] Add custom sampler order support (#5443) --- docs/03 - Parameters Tab.md | 5 +- extensions/openai/typing.py | 1 + modules/loaders.py | 3 + modules/presets.py | 1 + modules/sampler_hijack.py | 294 +++++++++++++++++++++++------------- modules/shared.py | 1 + modules/text_generation.py | 5 + modules/ui.py | 1 + modules/ui_parameters.py | 7 +- 9 files changed, 205 insertions(+), 113 deletions(-) diff --git a/docs/03 - Parameters Tab.md b/docs/03 - Parameters Tab.md index affa9e73..c274a9c5 100644 --- a/docs/03 - Parameters Tab.md +++ b/docs/03 - Parameters Tab.md @@ -55,8 +55,8 @@ For more information about the parameters, the [transformers documentation](http * **mirostat_tau**: No idea, see the paper for details. According to the Preset Arena, 8 is a good value. * **mirostat_eta**: No idea, see the paper for details. According to the Preset Arena, 0.1 is a good value. * **dynamic_temperature**: Activates Dynamic Temperature. This modifies temperature to range between "dynatemp_low" (minimum) and "dynatemp_high" (maximum), with an entropy-based scaling. The steepness of the curve is controlled by "dynatemp_exponent". -* **smoothing_factor**: Activates Quadratic Sampling. This takes precedence over regular temperature and dynamic temperature, and replaces those samplers. When `0 < smoothing_factor < 1`, the logits distribution becomes flatter. When `smoothing_factor > 1`, it becomes more peaked. -* **temperature_last**: Makes temperature the last sampler instead of the first. With this, you can remove low probability tokens with a sampler like min_p and then use a high temperature to make the model creative without losing coherency. +* **smoothing_factor**: Activates Quadratic Sampling. When `0 < smoothing_factor < 1`, the logits distribution becomes flatter. When `smoothing_factor > 1`, it becomes more peaked. +* **temperature_last**: Makes temperature the last sampler instead of the first. With this, you can remove low probability tokens with a sampler like min_p and then use a high temperature to make the model creative without losing coherency. Note: this parameter takes precedence over "Sampler priority". That means that `temperature`/`dynamic_temperature`/`quadratic_sampling` will be removed from wherever they are and moved to the end of the stack. * **do_sample**: When unchecked, sampling is entirely disabled, and greedy decoding is used instead (the most likely token is always picked). * **Seed**: Set the Pytorch seed to this number. Note that some loaders do not use Pytorch (notably llama.cpp), and others are not deterministic (notably ExLlama v1 and v2). For these loaders, the seed has no effect. * **encoder_repetition_penalty**: Also known as the "Hallucinations filter". Used to penalize tokens that are *not* in the prior text. Higher value = more likely to stay in context, lower value = more likely to diverge. @@ -77,6 +77,7 @@ To the right (or below if you are on mobile), the following parameters are prese * **Add the bos_token to the beginning of prompts**: By default, the tokenizer will add a BOS (Beginning of Sequence) token to your prompt. During training, BOS tokens are used to separate different documents. 
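The two samplers described in the bullets above reduce to a few lines each once the plumbing is stripped away. A condensed sketch of both transformations, following the implementations this patch adds to modules/sampler_hijack.py (guards against zero entropy are omitted for brevity):

```python
import math

import torch

def dynamic_temperature(scores, dynatemp_low, dynatemp_high, dynatemp_exponent):
    # Normalized entropy of the distribution picks a temperature in [low, high].
    probs = torch.softmax(scores, dim=-1)
    entropy = -torch.where(probs > 0, probs * torch.log(probs), torch.zeros_like(probs)).sum()
    max_entropy = math.log(torch.sum(scores > -float('inf')).item())
    dyn_temp = dynatemp_low + (dynatemp_high - dynatemp_low) * (entropy / max_entropy) ** dynatemp_exponent
    return scores / dyn_temp

def quadratic_sampling(scores, smoothing_factor):
    # Quadratic pull toward the max logit: factor > 1 peaks, 0 < factor < 1 flattens.
    max_logit = scores.max()
    return -(smoothing_factor * (scores - max_logit) ** 2) + max_logit

logits = torch.tensor([2.0, 1.0, 0.5, -1.0])
print(dynamic_temperature(logits.clone(), 0.5, 1.5, 1.0))
print(quadratic_sampling(logits, 0.3))
```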
If unchecked, no BOS token will be added, and the model will interpret your prompt as being in the middle of a document instead of at the start of one. This significantly changes the output and can make it more creative. * **Skip special tokens**: When decoding the generated tokens, skip special tokens from being converted to their text representation. Otherwise, BOS appears as ``, EOS as ``, etc. * **Activate text streaming**: When unchecked, the full response is outputted at once, without streaming the words one at a time. I recommend unchecking this parameter on high latency networks like running the webui on Google Colab or using `--share`. +* **Sampler priority**: Allows you to customize the order in which the different samplers are applied. The first sampler on the list gets applied first. With this, custom orders like `top_p -> temperature -> top_k` can be defined. * **Load grammar from file**: Loads a GBNF grammar from a file under `text-generation-webui/grammars`. The output is written to the "Grammar" box below. You can also save and delete custom grammars using this menu. * **Grammar**: Allows you to constrain the model output to a particular format. For instance, you can make the model generate lists, JSON, specific words, etc. Grammar is extremely powerful and I highly recommend it. The syntax looks a bit daunting at first sight, but it gets very easy once you understand it. See the [GBNF Guide](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md) for details. diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index 3deb464f..ec351167 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -40,6 +40,7 @@ class GenerationOptions(BaseModel): max_tokens_second: int = 0 prompt_lookup_num_tokens: int = 0 custom_token_bans: str = "" + sampler_priority: List[str] | str | None = Field(default=None, description="List of samplers where the first items will appear first in the stack. 
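As the API field above suggests, sampler_priority may arrive either as a list or as a single string. The normalization this patch adds to generate_reply_HF accepts newline- or comma-separated names, along these lines:

```python
def normalize_sampler_priority(value):
    # Lists pass through; strings are split on newlines or commas.
    if isinstance(value, list):
        return value
    return [x.strip() for x in value.replace('\n', ',').split(',') if x.strip()]

print(normalize_sampler_priority('temperature\ntop_k, top_p'))
# ['temperature', 'top_k', 'top_p']
```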
Example: [\"top_k\", \"temperature\", \"top_p\"].") auto_max_new_tokens: bool = False ban_eos_token: bool = False add_bos_token: bool = True diff --git a/modules/loaders.py b/modules/loaders.py index 6107dac7..687a9e92 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -182,6 +182,7 @@ def transformers_samplers(): 'negative_prompt', 'ban_eos_token', 'custom_token_bans', + 'sampler_priority', 'add_bos_token', 'skip_special_tokens', 'auto_max_new_tokens', @@ -230,6 +231,7 @@ loaders_samplers = { 'negative_prompt', 'ban_eos_token', 'custom_token_bans', + 'sampler_priority', 'add_bos_token', 'skip_special_tokens', 'auto_max_new_tokens', @@ -287,6 +289,7 @@ loaders_samplers = { 'negative_prompt', 'ban_eos_token', 'custom_token_bans', + 'sampler_priority', 'add_bos_token', 'skip_special_tokens', 'auto_max_new_tokens', diff --git a/modules/presets.py b/modules/presets.py index 966c706e..2a4a4dde 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -42,6 +42,7 @@ def default_preset(): 'num_beams': 1, 'length_penalty': 1, 'early_stopping': False, + 'sampler_priority': 'temperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat' } diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py index 59b90b02..9701b034 100644 --- a/modules/sampler_hijack.py +++ b/modules/sampler_hijack.py @@ -1,4 +1,5 @@ import math +import pprint import torch import transformers @@ -6,21 +7,21 @@ from transformers import LogitsWarper, is_torch_xpu_available from transformers.generation.logits_process import ( LogitNormalization, LogitsProcessor, - LogitsProcessorList, - TemperatureLogitsWarper + LogitsProcessorList ) from modules import shared +from modules.logging_colors import logger global_scores = None -class ModifiedTemperatureLogitsWarper(LogitsWarper): +class TemperatureLogitsWarperCustom(LogitsWarper): ''' - Based on the original Transformers temperature logits warper, this - adds support for dynamic temperature and quadratic sampling. + A copy of the original Transformers temperature logits warper. ''' - def __init__(self, temperature: float, dynamic_temperature: bool, dynatemp_low: float, dynatemp_high: float, dynatemp_exponent: float, smoothing_factor: float): + + def __init__(self, temperature: float): if not isinstance(temperature, float) or not (temperature > 0): except_msg = ( f"`temperature` (={temperature}) has to be a strictly positive float, otherwise your next token " @@ -32,81 +33,90 @@ class ModifiedTemperatureLogitsWarper(LogitsWarper): raise ValueError(except_msg) self.temperature = temperature - self.dynamic_temperature = dynamic_temperature + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + scores = scores / self.temperature + return scores + + +class DynamicTemperatureLogitsWarper(LogitsWarper): + ''' + Dynamic temperature. 
+ ''' + + def __init__(self, dynatemp_low: float, dynatemp_high: float, dynatemp_exponent: float): self.dynatemp_low = dynatemp_low self.dynatemp_high = dynatemp_high self.dynatemp_exponent = dynatemp_exponent + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + min_temp = self.dynatemp_low + max_temp = self.dynatemp_high + exponent_val = self.dynatemp_exponent + + # Convert logits to probabilities + probs = torch.softmax(scores, dim=-1) + + # Calculate entropy of the softmax probabilities + entropy = -1.0 * torch.where(probs > 0, probs * torch.log(probs), torch.zeros_like(probs)).sum() + + # Guard against future possible division by zero + entropy = max(entropy, torch.tensor(1e-10)) # Ensures entropy is slightly greater than 0 + + # Any logits which are not -Infinity will be considered for calculating max entropy. + num_valid_tokens = torch.sum(scores > -float('inf')).item() + + # Now, calculate the max entropy by using only the valid tokens' count + max_entropy = math.log(num_valid_tokens) + + # Guard against future possible division by zero + max_entropy = max_entropy if max_entropy > 0.0 else 1e-10 + + # Normalize the entropy + normalized_entropy = entropy / max_entropy + + # Map the normalized entropy to the desired temperature range using the power function + dyn_temp = min_temp + (max_temp - min_temp) * (normalized_entropy.pow(exponent_val)) + + # Apply the dynamically calculated temperature scaling + scores = scores / dyn_temp + + # print("----------------------\nTemperature from generation_config:", self.temperature) + # print("min_temp:", min_temp) + # print("max_temp:", max_temp) + # print("Entropy:", entropy.item()) + # print("Max Possible Entropy considering valid tokens only:", max_entropy) + # print("Normalized Entropy:", normalized_entropy.item()) + # print("Dynamic Temperature (dyn_temp):", dyn_temp.item()) + # print("----------------------") + + # max_prob_token_id = torch.argmax(scores, dim=-1) # Get the token ID with the highest probability + # max_prob_token = shared.tokenizer.convert_ids_to_tokens(int(max_prob_token_id)) # Convert ID to token + # print("--- T=", float(dyn_temp), "token=", max_prob_token, "min=", min_temp, "max=", max_temp, "exponent=", exponent_val) + + return scores + + +class QuadraticSamplingLogitsWarper(LogitsWarper): + ''' + Quadratic sampling. 
+ ''' + + def __init__(self, smoothing_factor: float): self.smoothing_factor = smoothing_factor def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + # Compute the maximum logit value + max_logit = scores.max() - # Quadratic sampling - if self.smoothing_factor > 0: + # Apply the quadratic transformation + transformed_logits = -(self.smoothing_factor * (scores - max_logit)**2) + max_logit - # Compute the maximum logit value - max_logit = scores.max() + # No need to print the top 5 logits since this is not required + # print("Original top 5 logits: ", torch.topk(scores, 5)) + # print("New top 5 logits: ", torch.topk(transformed_logits, 5)) - # Apply the quadratic transformation - transformed_logits = -(self.smoothing_factor * (scores - max_logit)**2) + max_logit - - # No need to print the top 5 logits since this is not required - # print("Original top 5 logits: ", torch.topk(scores, 5)) - # print("New top 5 logits: ", torch.topk(transformed_logits, 5)) - - return transformed_logits - - # Dynamic temperature - elif self.dynamic_temperature: - min_temp = self.dynatemp_low - max_temp = self.dynatemp_high - exponent_val = self.dynatemp_exponent - - # Convert logits to probabilities - probs = torch.softmax(scores, dim=-1) - - # Calculate entropy of the softmax probabilities - entropy = -1.0 * torch.where(probs > 0, probs * torch.log(probs), torch.zeros_like(probs)).sum() - - # Guard against future possible division by zero - entropy = max(entropy, torch.tensor(1e-10)) # Ensures entropy is slightly greater than 0 - - # Any logits which are not -Infinity will be considered for calculating max entropy. - num_valid_tokens = torch.sum(scores > -float('inf')).item() - - # Now, calculate the max entropy by using only the valid tokens' count - max_entropy = math.log(num_valid_tokens) - - # Guard against future possible division by zero - max_entropy = max_entropy if max_entropy > 0.0 else 1e-10 - - # Normalize the entropy - normalized_entropy = entropy / max_entropy - - # Map the normalized entropy to the desired temperature range using the power function - dyn_temp = min_temp + (max_temp - min_temp) * (normalized_entropy.pow(exponent_val)) - - # Apply the dynamically calculated temperature scaling - scores = scores / dyn_temp - - # print("----------------------\nTemperature from generation_config:", self.temperature) - # print("min_temp:", min_temp) - # print("max_temp:", max_temp) - # print("Entropy:", entropy.item()) - # print("Max Possible Entropy considering valid tokens only:", max_entropy) - # print("Normalized Entropy:", normalized_entropy.item()) - # print("Dynamic Temperature (dyn_temp):", dyn_temp.item()) - # print("----------------------") - - # max_prob_token_id = torch.argmax(scores, dim=-1) # Get the token ID with the highest probability - # max_prob_token = shared.tokenizer.convert_ids_to_tokens(int(max_prob_token_id)) # Convert ID to token - # print("--- T=", float(dyn_temp), "token=", max_prob_token, "min=", min_temp, "max=", max_temp, "exponent=", exponent_val) - - return scores - - # Regular temperature - else: - scores = scores / self.temperature - return scores + return transformed_logits class MinPLogitsWarper(LogitsWarper): @@ -209,6 +219,7 @@ class MirostatLogitsWarper(LogitsWarper): def __init__(self, mirostat_mode: int, mirostat_tau: float, mirostat_eta: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): if mirostat_mode not in [2]: raise ValueError(f"`mirostat` has to be a an integer 2, but is 
{mirostat_mode}") + self.mirostat_mode = mirostat_mode self.mirostat_eta = mirostat_eta self.mirostat_tau = mirostat_tau @@ -301,44 +312,74 @@ class RepetitionPenaltyLogitsProcessorWithRange(LogitsProcessor): def get_logits_warper_patch(self, generation_config): - # Make sure that temperature is float and not int + + # Parameter sanitization if isinstance(generation_config.temperature, int): - generation_config.temperature = float(generation_config.temperature) - - temperature = generation_config.temperature - if generation_config.dynamic_temperature or generation_config.smoothing_factor > 0: - # Make sure TemperatureLogitsWarper will be created by temporarily - # setting temperature to a value != 1. - generation_config.temperature = 1.1 + generation_config.temperature = float(generation_config.temperature) # Must be float + # Get the original warpers warpers = self._get_logits_warper_old(generation_config) + + # Replace temperature with our modified class. + # Currently, it behaves identically to the original. for i in range(len(warpers)): if warpers[i].__class__.__name__ == 'TemperatureLogitsWarper': - warpers[i] = ModifiedTemperatureLogitsWarper( - temperature, - generation_config.dynamic_temperature, - generation_config.dynatemp_low, - generation_config.dynatemp_high, - generation_config.dynatemp_exponent, - generation_config.smoothing_factor + warpers[i] = TemperatureLogitsWarperCustom( + generation_config.temperature, ) + # Add custom warpers warpers_to_add = LogitsProcessorList() min_tokens_to_keep = 2 if generation_config.num_beams > 1 else 1 + if generation_config.tfs is not None and 0.0 <= generation_config.tfs < 1.0: + warpers_to_add.append( + TailFreeLogitsWarper( + tfs=generation_config.tfs, + min_tokens_to_keep=min_tokens_to_keep + ) + ) + + if generation_config.top_a is not None and 0.0 < generation_config.top_a <= 1.0: + warpers_to_add.append( + TopALogitsWarper( + top_a=generation_config.top_a, + min_tokens_to_keep=min_tokens_to_keep + ) + ) + + if generation_config.min_p is not None and 0.0 < generation_config.min_p <= 1.0: + warpers_to_add.append( + MinPLogitsWarper( + min_p=generation_config.min_p, + min_tokens_to_keep=min_tokens_to_keep + ) + ) + + if generation_config.dynamic_temperature: + warpers_to_add.append( + DynamicTemperatureLogitsWarper( + dynatemp_low=generation_config.dynatemp_low, + dynatemp_high=generation_config.dynatemp_high, + dynatemp_exponent=generation_config.dynatemp_exponent, + ) + ) + + if generation_config.smoothing_factor > 0: + warpers_to_add.append( + QuadraticSamplingLogitsWarper( + smoothing_factor=generation_config.smoothing_factor + ) + ) if generation_config.mirostat_mode is not None and generation_config.mirostat_mode == 2: - warpers_to_add.append(MirostatLogitsWarper(mirostat_mode=generation_config.mirostat_mode, mirostat_eta=generation_config.mirostat_eta, mirostat_tau=generation_config.mirostat_tau, min_tokens_to_keep=min_tokens_to_keep)) - # We need to disable samplers other than temperature - for warper in warpers: - if not isinstance(warper, TemperatureLogitsWarper): - warpers.remove(warper) - else: - if generation_config.tfs is not None and 0.0 <= generation_config.tfs < 1.0: - warpers_to_add.append(TailFreeLogitsWarper(tfs=generation_config.tfs, min_tokens_to_keep=min_tokens_to_keep)) - if generation_config.top_a is not None and 0.0 < generation_config.top_a <= 1.0: - warpers_to_add.append(TopALogitsWarper(top_a=generation_config.top_a, min_tokens_to_keep=min_tokens_to_keep)) - if generation_config.min_p is not None and 0.0 < 
generation_config.min_p <= 1.0: - warpers_to_add.append(MinPLogitsWarper(min_p=generation_config.min_p, min_tokens_to_keep=min_tokens_to_keep)) + warpers_to_add.append( + MirostatLogitsWarper( + mirostat_mode=generation_config.mirostat_mode, + mirostat_eta=generation_config.mirostat_eta, + mirostat_tau=generation_config.mirostat_tau, + min_tokens_to_keep=min_tokens_to_keep + ) + ) if len(warpers) > 0 and isinstance(warpers[-1], LogitNormalization): normalize = warpers.pop(-1) @@ -346,23 +387,57 @@ def get_logits_warper_patch(self, generation_config): normalize = None warpers += warpers_to_add - if generation_config.temperature_last: - temperature_idx = None - for i in range(len(warpers)): - if warpers[i].__class__.__name__ in ['TemperatureLogitsWarper', 'ModifiedTemperatureLogitsWarper']: - temperature_idx = i - break - if temperature_idx is not None: - warpers.append(warpers.pop(temperature_idx)) + # Sort the samplers. + sampler_priority = generation_config.sampler_priority + + # Handle temperature_last + if generation_config.temperature_last: + for param_name in ['temperature', 'dynamic_temperature', 'quadratic_sampling']: + if param_name in sampler_priority: + if param_name in sampler_priority: + index = sampler_priority.index(param_name) + sampler_priority.append(sampler_priority.pop(index)) + else: + sampler_priority.append(param_name) + + class_name_to_nickname = { + 'DynamicTemperatureLogitsWarper': 'dynamic_temperature', + 'EpsilonLogitsWarper': 'epsilon_cutoff', + 'EtaLogitsWarper': 'eta_cutoff', + 'MinPLogitsWarper': 'min_p', + 'MirostatLogitsWarper': 'mirostat', + 'QuadraticSamplingLogitsWarper': 'quadratic_sampling', + 'TailFreeLogitsWarper': 'tfs', + 'TemperatureLogitsWarperCustom': 'temperature', + 'TopALogitsWarper': 'top_a', + 'TopKLogitsWarper': 'top_k', + 'TopPLogitsWarper': 'top_p', + 'TypicalLogitsWarper': 'typical_p' + } + + def custom_sort_key(obj): + class_name = obj.__class__.__name__ + + # Return a large value if class name is not mapped or if the mapped nickname is not in priority + if class_name not in class_name_to_nickname or class_name_to_nickname[class_name] not in sampler_priority: + return float('inf') + + # Return the index of the nickname in the priority list for sorting + return sampler_priority.index(class_name_to_nickname[class_name]) + + # Sort the list using the custom key function + warpers = sorted(warpers, key=custom_sort_key) if normalize is not None: warpers.append(normalize) warpers.append(SpyLogitsWarper()) warpers = LogitsProcessorList(warpers) - # for i in range(len(warpers)): - # print(warpers[i].__class__.__name__) + if shared.args.verbose: + logger.info("WARPERS=") + pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint([x.__class__.__name__ for x in warpers]) + return warpers @@ -402,6 +477,7 @@ def generation_config_init_patch(self, **kwargs): self.presence_penalty = kwargs.pop("presence_penalty", 0) self.frequency_penalty = kwargs.pop("frequency_penalty", 0) self.temperature_last = kwargs.pop("temperature_last", False) + self.sampler_priority = kwargs.pop("sampler_priority", ['temperature', 'dynamic_temperature', 'quadratic_sampling', 'top_k', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', 'min_p', 'mirostat']) def hijack_samplers(): diff --git a/modules/shared.py b/modules/shared.py index 38d08349..eea3d27f 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -50,6 +50,7 @@ settings = { 'prompt_lookup_num_tokens': 0, 'custom_stopping_strings': '', 'custom_token_bans': '', + 'sampler_priority': 
'temperature,top_k,top_p,typical_p,epsilon_cutoff,eta_cutoff,tfs,top_a,min_p,dynamic_temperature,quadratic_sampling,mirostat', 'auto_max_new_tokens': False, 'ban_eos_token': False, 'add_bos_token': True, diff --git a/modules/text_generation.py b/modules/text_generation.py index 04625ab9..1808f8bf 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -291,6 +291,11 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings if k in state: generate_params[k] = state[k] + if isinstance(state['sampler_priority'], list): + generate_params['sampler_priority'] = state['sampler_priority'] + elif isinstance(state['sampler_priority'], str): + generate_params['sampler_priority'] = [x.strip() for x in state['sampler_priority'].replace('\n', ',').split(',') if x.strip()] + if state['negative_prompt'] != '': generate_params['negative_prompt_ids'] = encode(state['negative_prompt']) diff --git a/modules/ui.py b/modules/ui.py index acd959a0..06498f69 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -149,6 +149,7 @@ def list_interface_input_elements(): 'add_bos_token', 'ban_eos_token', 'custom_token_bans', + 'sampler_priority', 'truncation_length', 'custom_stopping_strings', 'skip_special_tokens', diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index a81ed27a..078590dc 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -49,12 +49,12 @@ def create_ui(default_preset): shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=generate_params['mirostat_mode'], label='mirostat_mode', info='mode=1 is for llama.cpp only.') shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=generate_params['mirostat_tau'], label='mirostat_tau') shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=generate_params['mirostat_eta'], label='mirostat_eta') - shared.gradio['smoothing_factor'] = gr.Slider(0.0, 10.0, value=generate_params['smoothing_factor'], step=0.01, label='smoothing_factor', info='Replaces temperature with Quadratic Sampling.') + shared.gradio['smoothing_factor'] = gr.Slider(0.0, 10.0, value=generate_params['smoothing_factor'], step=0.01, label='smoothing_factor', info='Activates Quadratic Sampling.') shared.gradio['dynamic_temperature'] = gr.Checkbox(value=generate_params['dynamic_temperature'], label='dynamic_temperature') shared.gradio['dynatemp_low'] = gr.Slider(0.01, 5, value=generate_params['dynatemp_low'], step=0.01, label='dynatemp_low', visible=generate_params['dynamic_temperature']) shared.gradio['dynatemp_high'] = gr.Slider(0.01, 5, value=generate_params['dynatemp_high'], step=0.01, label='dynatemp_high', visible=generate_params['dynamic_temperature']) shared.gradio['dynatemp_exponent'] = gr.Slider(0.01, 5, value=generate_params['dynatemp_exponent'], step=0.01, label='dynatemp_exponent', visible=generate_params['dynamic_temperature']) - shared.gradio['temperature_last'] = gr.Checkbox(value=generate_params['temperature_last'], label='temperature_last', info='Makes temperature the last sampler instead of the first.') + shared.gradio['temperature_last'] = gr.Checkbox(value=generate_params['temperature_last'], label='temperature_last', info='Moves temperature/dynamic temperature/quadratic sampling to the end of the sampler stack, ignoring their positions in "Sampler priority".') shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample') shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label='Seed (-1 for random)') with gr.Accordion('Other 
parameters', open=False): @@ -85,6 +85,9 @@ def create_ui(default_preset): shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming') + with gr.Blocks(): + shared.gradio['sampler_priority'] = gr.Textbox(value=generate_params['sampler_priority'], lines=12, label='Sampler priority', info='Parameter names separated by new lines or commas.') + with gr.Row() as shared.gradio['grammar_file_row']: shared.gradio['grammar_file'] = gr.Dropdown(value='None', choices=utils.get_available_grammars(), label='Load grammar from file (.gbnf)', elem_classes='slim-dropdown') ui.create_refresh_button(shared.gradio['grammar_file'], lambda: None, lambda: {'choices': utils.get_available_grammars()}, 'refresh-button', interactive=not mu) From 2a1063eff5fe5ed7580436a52efc91032800e476 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 Feb 2024 06:21:17 -0800 Subject: [PATCH 15/18] Revert "Remove non-HF ExLlamaV2 loader (#5431)" This reverts commit cde000d47801fa13c5a88f9e435da64132bd96bc. --- modules/LoRA.py | 8 +- modules/exllamav2.py | 149 +++++++++++++++++++++++++++++++++++++ modules/loaders.py | 33 ++++++++ modules/logits.py | 12 ++- modules/models.py | 8 ++ modules/models_settings.py | 2 - modules/shared.py | 14 ++-- modules/text_generation.py | 11 ++- modules/ui_model_menu.py | 1 + 9 files changed, 220 insertions(+), 18 deletions(-) create mode 100644 modules/exllamav2.py diff --git a/modules/LoRA.py b/modules/LoRA.py index 0cb1671e..15132f4e 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -12,7 +12,7 @@ from modules.models import reload_model def add_lora_to_model(lora_names): if 'GPTQForCausalLM' in shared.model.__class__.__name__ or shared.args.loader == 'AutoGPTQ': add_lora_autogptq(lora_names) - elif shared.model.__class__.__name__ == 'Exllamav2HF' or shared.args.loader == 'ExLlamav2_HF': + elif shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']: add_lora_exllamav2(lora_names) else: add_lora_transformers(lora_names) @@ -39,7 +39,11 @@ def add_lora_exllamav2(lora_names): shared.model.loras = [] for lora_name in lora_names: lora_path = get_lora_path(lora_name) - lora = ExLlamaV2Lora.from_directory(shared.model.ex_model, str(lora_path)) + if shared.model.__class__.__name__ == 'Exllamav2Model': + lora = ExLlamaV2Lora.from_directory(shared.model.model, str(lora_path)) + else: + lora = ExLlamaV2Lora.from_directory(shared.model.ex_model, str(lora_path)) + shared.model.loras.append(lora) shared.lora_names = lora_names diff --git a/modules/exllamav2.py b/modules/exllamav2.py new file mode 100644 index 00000000..551ed498 --- /dev/null +++ b/modules/exllamav2.py @@ -0,0 +1,149 @@ +import traceback +from pathlib import Path + +import torch +from exllamav2 import ( + ExLlamaV2, + ExLlamaV2Cache, + ExLlamaV2Cache_8bit, + ExLlamaV2Config, + ExLlamaV2Tokenizer +) +from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator + +from modules import shared +from modules.logging_colors import logger +from modules.text_generation import get_max_prompt_length + +try: + import flash_attn +except ModuleNotFoundError: + logger.warning( + 'You are running ExLlamaV2 without flash-attention. 
This will cause the VRAM usage ' + 'to be a lot higher than it could be.\n' + 'Try installing flash-attention following the instructions here: ' + 'https://github.com/Dao-AILab/flash-attention#installation-and-features' + ) + pass +except Exception: + logger.warning('Failed to load flash-attention due to the following error:\n') + traceback.print_exc() + + +class Exllamav2Model: + def __init__(self): + pass + + @classmethod + def from_pretrained(self, path_to_model): + + path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model) + + config = ExLlamaV2Config() + config.model_dir = str(path_to_model) + config.prepare() + + config.max_seq_len = shared.args.max_seq_len + config.scale_pos_emb = shared.args.compress_pos_emb + config.scale_alpha_value = shared.args.alpha_value + config.no_flash_attn = shared.args.no_flash_attn + config.num_experts_per_token = int(shared.args.num_experts_per_token) + + model = ExLlamaV2(config) + + split = None + if shared.args.gpu_split: + split = [float(alloc) for alloc in shared.args.gpu_split.split(",")] + + model.load(split) + + tokenizer = ExLlamaV2Tokenizer(config) + if shared.args.cache_8bit: + cache = ExLlamaV2Cache_8bit(model) + else: + cache = ExLlamaV2Cache(model) + + generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer) + + result = self() + result.model = model + result.cache = cache + result.tokenizer = tokenizer + result.generator = generator + result.loras = None + return result, result + + def encode(self, string, **kwargs): + return self.tokenizer.encode(string, add_bos=True, encode_special_tokens=True) + + def decode(self, ids, **kwargs): + if isinstance(ids, list): + ids = torch.tensor([ids]) + elif isinstance(ids, torch.Tensor) and ids.numel() == 1: + ids = ids.view(1, -1) + + return self.tokenizer.decode(ids, decode_special_tokens=True)[0] + + def get_logits(self, token_ids, **kwargs): + self.cache.current_seq_len = 0 + if token_ids.shape[-1] > 1: + self.model.forward(token_ids[:, :-1], self.cache, input_mask=None, preprocess_only=True, loras=self.loras) + + return self.model.forward(token_ids[:, -1:], self.cache, input_mask=None, loras=self.loras, **kwargs).float().cpu() + + def generate_with_streaming(self, prompt, state): + settings = ExLlamaV2Sampler.Settings() + + settings.token_repetition_penalty = state['repetition_penalty'] + settings.token_repetition_range = -1 if state['repetition_penalty_range'] <= 0 else state['repetition_penalty_range'] + + settings.token_frequency_penalty = state['frequency_penalty'] + settings.token_presence_penalty = state['presence_penalty'] + + settings.temperature = state['temperature'] + settings.top_k = state['top_k'] + settings.top_p = state['top_p'] + settings.top_a = state['top_a'] + settings.min_p = state['min_p'] + settings.tfs = state['tfs'] + settings.typical = state['typical_p'] + + settings.temperature_last = state['temperature_last'] + + settings.mirostat = state['mirostat_mode'] == 2 + settings.mirostat_tau = state['mirostat_tau'] + settings.mirostat_eta = state['mirostat_eta'] + + if state['ban_eos_token']: + settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id]) + + if state['custom_token_bans']: + to_ban = [int(x) for x in state['custom_token_bans'].split(',')] + if len(to_ban) > 0: + settings.disallow_tokens(self.tokenizer, to_ban) + + ids = self.tokenizer.encode(prompt, add_bos=state['add_bos_token'], encode_special_tokens=True) + ids = ids[:, -get_max_prompt_length(state):] + + if state['auto_max_new_tokens']: + max_new_tokens = 
state['truncation_length'] - ids.shape[-1] + else: + max_new_tokens = state['max_new_tokens'] + + self.generator.begin_stream(ids, settings, loras=self.loras) + + decoded_text = '' + for i in range(max_new_tokens): + chunk, eos, _ = self.generator.stream() + if eos or shared.stop_everything: + break + + decoded_text += chunk + yield decoded_text + + def generate(self, prompt, state): + output = '' + for output in self.generate_with_streaming(prompt, state): + pass + + return output diff --git a/modules/loaders.py b/modules/loaders.py index 687a9e92..26b7c5e2 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -83,6 +83,16 @@ loaders_and_params = OrderedDict({ 'trust_remote_code', 'no_use_fast', ], + 'ExLlamav2': [ + 'gpu_split', + 'max_seq_len', + 'no_flash_attn', + 'num_experts_per_token', + 'cache_8bit', + 'alpha_value', + 'compress_pos_emb', + 'exllamav2_info', + ], 'AutoGPTQ': [ 'triton', 'no_inject_fused_attention', @@ -197,6 +207,29 @@ loaders_samplers = { 'AutoAWQ': transformers_samplers(), 'QuIP#': transformers_samplers(), 'HQQ': transformers_samplers(), + 'ExLlamav2': { + 'temperature', + 'temperature_last', + 'top_p', + 'min_p', + 'top_k', + 'typical_p', + 'tfs', + 'top_a', + 'repetition_penalty', + 'presence_penalty', + 'frequency_penalty', + 'repetition_penalty_range', + 'seed', + 'mirostat_mode', + 'mirostat_tau', + 'mirostat_eta', + 'ban_eos_token', + 'add_bos_token', + 'custom_token_bans', + 'skip_special_tokens', + 'auto_max_new_tokens', + }, 'ExLlamav2_HF': { 'temperature', 'temperature_last', diff --git a/modules/logits.py b/modules/logits.py index c2cbd92e..c630be88 100644 --- a/modules/logits.py +++ b/modules/logits.py @@ -13,10 +13,11 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return logger.error("No model is loaded! 
Select one in the Model tab.") return 'Error: No model is loaded1 Select one in the Model tab.', previous + is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model' is_non_hf_llamacpp = shared.model.__class__.__name__ == 'LlamaCppModel' if use_samplers: - if is_non_hf_llamacpp: + if any([is_non_hf_exllamav2, is_non_hf_llamacpp]): logger.error("Sampler hijacking is not supported non-Huggingface loaders.") # sampling is all done in c for exllama, so it is really hard to hijack # it should be possible to hijack llamacpp sampler by hijacking all their sampling methods, @@ -30,7 +31,13 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return scores = sampler_hijack.global_scores[-1] else: - if is_non_hf_llamacpp: + if is_non_hf_exllamav2: + if is_torch_xpu_available(): + tokens = shared.tokenizer.encode(prompt).to("xpu:0") + else: + tokens = shared.tokenizer.encode(prompt).cuda() + scores = shared.model.get_logits(tokens)[-1][-1] + elif is_non_hf_llamacpp: tokens = shared.tokenizer.encode(prompt) scores = shared.model.get_logits(tokens)[-1][-1] else: @@ -38,7 +45,6 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("xpu:0") else: tokens = shared.tokenizer.encode(prompt, return_tensors='pt').cuda() - output = shared.model(input_ids=tokens) scores = output['logits'][-1][-1] diff --git a/modules/models.py b/modules/models.py index ab0e762c..6c38c3c7 100644 --- a/modules/models.py +++ b/modules/models.py @@ -65,6 +65,7 @@ def load_model(model_name, loader=None): 'GPTQ-for-LLaMa': GPTQ_loader, 'llama.cpp': llamacpp_loader, 'llamacpp_HF': llamacpp_HF_loader, + 'ExLlamav2': ExLlamav2_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, 'ctransformers': ctransformers_loader, 'AutoAWQ': AutoAWQ_loader, @@ -375,6 +376,13 @@ def AutoGPTQ_loader(model_name): return modules.AutoGPTQ_loader.load_quantized(model_name) +def ExLlamav2_loader(model_name): + from modules.exllamav2 import Exllamav2Model + + model, tokenizer = Exllamav2Model.from_pretrained(model_name) + return model, tokenizer + + def ExLlamav2_HF_loader(model_name): from modules.exllamav2_hf import Exllamav2HF diff --git a/modules/models_settings.py b/modules/models_settings.py index 3e1649aa..9acc7efa 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -141,8 +141,6 @@ def get_model_metadata(model): if re.match(pat.lower(), model.lower()): for k in settings[pat]: model_settings[k] = settings[pat][k] - if k == 'loader' and settings[pat][k] == 'ExLlamav2': - model_settings[k] = 'ExLlamav2_HF' return model_settings diff --git a/modules/shared.py b/modules/shared.py index eea3d27f..78966617 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -89,7 +89,7 @@ group.add_argument('--chat-buttons', action='store_true', help='Show buttons on # Model loader group = parser.add_argument_group('Model loader') -group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#.') +group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. 
Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') @@ -132,11 +132,11 @@ group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.') group.add_argument('--row_split', action='store_true', help='Split multi-gpu by row instead of layer. Faster on some cards.') -# ExLlamaV2 -group = parser.add_argument_group('ExLlamaV2') +# ExLlama +group = parser.add_argument_group('ExLlama') group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.') group.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequence length.') -group.add_argument('--cfg-cache', action='store_true', help='Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.') +group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.') group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.') group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.') group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.') @@ -250,7 +250,11 @@ def fix_loader_name(name): return 'AutoGPTQ' elif name in ['gptq-for-llama', 'gptqforllama', 'gptqllama', 'gptq for llama', 'gptq_for_llama']: return 'GPTQ-for-LLaMa' - elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2', 'exllama', 'ex-llama', 'ex_llama', 'exlama', 'exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']: + elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']: + return 'ExLlama' + elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']: + return 'ExLlamav2' + elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']: return 'ExLlamav2_HF' elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']: return 'ctransformers' diff --git a/modules/text_generation.py b/modules/text_generation.py index 1808f8bf..1917a0c1 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -45,7 +45,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap yield '' return - if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel']: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel']: generate_func = generate_reply_custom else: generate_func = generate_reply_HF @@ -121,11 +121,10 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if shared.tokenizer is None: raise ValueError('No tokenizer is loaded') - if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel']: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel', 
'Exllamav2Model']: input_ids = shared.tokenizer.encode(str(prompt)) - # The step below is necessary for llama.cpp, but may not be - # necessary for future loaders. - input_ids = np.array(input_ids).reshape(1, len(input_ids)) + if shared.model.__class__.__name__ not in ['Exllamav2Model']: + input_ids = np.array(input_ids).reshape(1, len(input_ids)) else: input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens) if not add_bos_token: @@ -136,7 +135,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if truncation_length is not None: input_ids = input_ids[:, -truncation_length:] - if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel'] or shared.args.cpu: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu: return input_ids elif shared.args.deepspeed: return input_ids.to(device=local_rank) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 87f15c1d..23679097 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -142,6 +142,7 @@ def create_ui(): shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel for GPTQ models.') shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.') shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.') + shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer.\n\nOption 1 (recommended): place your .gguf in a subfolder of models/ along with these 4 files: special_tokens_map.json, tokenizer_config.json, tokenizer.json, tokenizer.model.\n\nOption 2: download `oobabooga/llama-tokenizer` under "Download model or LoRA". That\'s a default Llama tokenizer that will work for some (but not all) models.') with gr.Column(): From 8a6d9abb414e41333eebf3234b22677db6253626 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 Feb 2024 06:26:27 -0800 Subject: [PATCH 16/18] Small fixes --- docs/04 - Model Tab.md | 4 ++++ docs/What Works.md | 1 + modules/shared.py | 3 +-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/04 - Model Tab.md b/docs/04 - Model Tab.md index 2585f544..762f85e8 100644 --- a/docs/04 - Model Tab.md +++ b/docs/04 - Model Tab.md @@ -47,6 +47,10 @@ Examples: * **no_flash_attn**: Disables flash attention. Otherwise, it is automatically used as long as the library is installed. * **cache_8bit**: Create a 8-bit precision cache instead of a 16-bit one. This saves VRAM but increases perplexity (I don't know by how much). +### ExLlamav2 + +The same as ExLlamav2_HF but using the internal samplers of ExLlamav2 instead of the ones in the Transformers library. + ### AutoGPTQ Loads: GPTQ models. 
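As a concrete illustration of the ExLlamav2 loader described in the Model Tab notes above, here is a minimal usage sketch of the re-added `Exllamav2Model` wrapper from `modules/exllamav2.py`. This is not part of the patch: it assumes the web UI's `modules` package is importable (so `shared.args` already carries its argparse defaults such as `max_seq_len`, `gpu_split` and `cache_8bit`), and the model folder name and sampler values are placeholders.

```python
from modules import shared
from modules.exllamav2 import Exllamav2Model

# Hypothetical EXL2 model folder under shared.args.model_dir ("models" by default).
model, tokenizer = Exllamav2Model.from_pretrained('MyModel-4.0bpw-exl2')
# from_pretrained() returns the same wrapper object twice, so `model` and
# `tokenizer` refer to the same Exllamav2Model instance here.

# The state dict mirrors the sampler settings read in generate_with_streaming().
state = {
    'temperature': 0.7, 'temperature_last': False,
    'top_k': 40, 'top_p': 0.9, 'typical_p': 1.0,
    'min_p': 0.0, 'top_a': 0.0, 'tfs': 1.0,
    'repetition_penalty': 1.15, 'repetition_penalty_range': 1024,
    'frequency_penalty': 0.0, 'presence_penalty': 0.0,
    'mirostat_mode': 0, 'mirostat_tau': 5.0, 'mirostat_eta': 0.1,
    'ban_eos_token': False, 'custom_token_bans': '',
    'add_bos_token': True, 'auto_max_new_tokens': False,
    'max_new_tokens': 200, 'truncation_length': 4096,
}

# Sampling happens inside ExLlamaV2 itself, which is why the Transformers-side
# sampler hijack (and the new sampler_priority option) does not apply to this loader.
output = ''
for output in model.generate_with_streaming('Once upon a time', state):
    pass

print(output)
```

In the web UI itself this path is selected with `--loader ExLlamav2`; as the docs and the `exllamav2_info` markdown note, ExLlamav2_HF remains the recommended default because it goes through the Transformers samplers.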
diff --git a/docs/What Works.md b/docs/What Works.md index 343343a1..354da1dd 100644 --- a/docs/What Works.md +++ b/docs/What Works.md @@ -6,6 +6,7 @@ | llama.cpp | ❌ | ❌ | ❌ | ❌ | use llamacpp_HF | | llamacpp_HF | ❌ | ❌ | ❌ | ❌ | ✅ | | ExLlamav2_HF | ✅ | ✅ | ❌ | ❌ | ✅ | +| ExLlamav2 | ✅ | ✅ | ❌ | ❌ | use ExLlamav2_HF | | AutoGPTQ | ✅ | ❌ | ❌ | ✅ | ✅ | | AutoAWQ | ? | ❌ | ? | ? | ✅ | | GPTQ-for-LLaMa | ✅\*\* | ✅\*\*\* | ✅ | ✅ | ✅ | diff --git a/modules/shared.py b/modules/shared.py index 78966617..5c81c1c7 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -50,7 +50,6 @@ settings = { 'prompt_lookup_num_tokens': 0, 'custom_stopping_strings': '', 'custom_token_bans': '', - 'sampler_priority': 'temperature,top_k,top_p,typical_p,epsilon_cutoff,eta_cutoff,tfs,top_a,min_p,dynamic_temperature,quadratic_sampling,mirostat', 'auto_max_new_tokens': False, 'ban_eos_token': False, 'add_bos_token': True, @@ -130,7 +129,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc group.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.') group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.') -group.add_argument('--row_split', action='store_true', help='Split multi-gpu by row instead of layer. Faster on some cards.') +group.add_argument('--row_split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.') # ExLlama group = parser.add_argument_group('ExLlama') From 8ee3cea7cb376478236499508ca884fa5fbba1fb Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 Feb 2024 06:31:27 -0800 Subject: [PATCH 17/18] Improve some log messages --- modules/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/models.py b/modules/models.py index 6c38c3c7..5929e868 100644 --- a/modules/models.py +++ b/modules/models.py @@ -100,9 +100,9 @@ def load_model(model_name, loader=None): elif loader in ['llama.cpp', 'llamacpp_HF', 'ctransformers']: shared.settings['truncation_length'] = shared.args.n_ctx - logger.info(f"LOADER: {loader}") + logger.info(f"LOADER: \"{loader}\"") logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}") - logger.info(f"INSTRUCTION TEMPLATE: {metadata['instruction_template']}") + logger.info(f"INSTRUCTION TEMPLATE: \"{metadata['instruction_template']}\"") logger.info(f"Loaded the model in {(time.time()-t0):.2f} seconds.") return model, tokenizer From acfbe6b3b3ead65c2985990dcecfae3617e14cb2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 Feb 2024 06:35:01 -0800 Subject: [PATCH 18/18] Minor doc changes --- docs/03 - Parameters Tab.md | 2 +- modules/shared.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/03 - Parameters Tab.md b/docs/03 - Parameters Tab.md index c274a9c5..ca1c203b 100644 --- a/docs/03 - Parameters Tab.md +++ b/docs/03 - Parameters Tab.md @@ -58,7 +58,7 @@ For more information about the parameters, the [transformers documentation](http * **smoothing_factor**: Activates Quadratic Sampling. 
When `0 < smoothing_factor < 1`, the logits distribution becomes flatter. When `smoothing_factor > 1`, it becomes more peaked. * **temperature_last**: Makes temperature the last sampler instead of the first. With this, you can remove low probability tokens with a sampler like min_p and then use a high temperature to make the model creative without losing coherency. Note: this parameter takes precedence over "Sampler priority". That means that `temperature`/`dynamic_temperature`/`quadratic_sampling` will be removed from wherever they are and moved to the end of the stack. * **do_sample**: When unchecked, sampling is entirely disabled, and greedy decoding is used instead (the most likely token is always picked). -* **Seed**: Set the Pytorch seed to this number. Note that some loaders do not use Pytorch (notably llama.cpp), and others are not deterministic (notably ExLlama v1 and v2). For these loaders, the seed has no effect. +* **Seed**: Set the Pytorch seed to this number. Note that some loaders do not use Pytorch (notably llama.cpp), and others are not deterministic (ExLlamaV2). For these loaders, the seed has no effect. * **encoder_repetition_penalty**: Also known as the "Hallucinations filter". Used to penalize tokens that are *not* in the prior text. Higher value = more likely to stay in context, lower value = more likely to diverge. * **no_repeat_ngram_size**: If not set to 0, specifies the length of token sets that are completely blocked from repeating at all. Higher values = blocks larger phrases, lower values = blocks words or letters from repeating. Only 0 or high values are a good idea in most cases. * **min_length**: Minimum generation length in tokens. This is a built-in parameter in the transformers library that has never been very useful. Typically you want to check "Ban the eos_token" instead. diff --git a/modules/shared.py b/modules/shared.py index 5c81c1c7..2861d690 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -131,8 +131,8 @@ group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.') group.add_argument('--row_split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.') -# ExLlama -group = parser.add_argument_group('ExLlama') +# ExLlamaV2 +group = parser.add_argument_group('ExLlamaV2') group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.') group.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequence length.') group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
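To make the `smoothing_factor` description in the Parameters Tab documentation above concrete, here is a small self-contained sketch (not part of the patch) that applies the same quadratic transformation used by `QuadraticSamplingLogitsWarper` in `modules/sampler_hijack.py` to a toy logit vector; the example values are arbitrary.

```python
import torch

def quadratic_transform(scores: torch.Tensor, smoothing_factor: float) -> torch.Tensor:
    # Same formula as QuadraticSamplingLogitsWarper: each logit is replaced by a
    # downward parabola centered on the maximum logit.
    max_logit = scores.max()
    return -(smoothing_factor * (scores - max_logit) ** 2) + max_logit

logits = torch.tensor([3.0, 2.0, 1.0, 0.0, -1.0])
print('plain softmax:', [round(p, 3) for p in torch.softmax(logits, dim=-1).tolist()])

for factor in (0.2, 1.0, 3.0):
    probs = torch.softmax(quadratic_transform(logits, factor), dim=-1)
    print(f'smoothing_factor={factor}:', [round(p, 3) for p in probs.tolist()])
```

For this toy vector, factors below 1 flatten the distribution relative to plain softmax, while factors above 1 concentrate it on the top token, matching the "flatter"/"more peaked" behavior described in the documentation bullet.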