AutoGPT/autogpt/scripts/llamafile/serve.py

#!/usr/bin/env python3
"""
Use llamafile to serve a (quantized) mistral-7b-instruct-v0.2 model
Usage:
cd <repo-root>/autogpt
./scripts/llamafile/serve.py
"""
import os
import platform
import subprocess
from pathlib import Path
from typing import Optional

import click

LLAMAFILE = Path("mistral-7b-instruct-v0.2.Q5_K_M.llamafile")
LLAMAFILE_URL = f"https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/{LLAMAFILE.name}"  # noqa
LLAMAFILE_EXE = Path("llamafile.exe")
LLAMAFILE_EXE_URL = "https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.6/llamafile-0.8.6"  # noqa
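
# Note: the first pair of constants points at a self-contained model+runtime
# llamafile; the second pair points at a model-less llamafile engine binary,
# which is needed on Windows (see the 4 GB executable limit handled in main()).
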
@click.command()
@click.option(
    "--llamafile",
    type=click.Path(dir_okay=False, path_type=Path),
    help=f"Name of the llamafile to serve. Default: {LLAMAFILE.name}",
)
@click.option("--llamafile_url", help="Download URL for the llamafile you want to use")
@click.option(
    "--host", help="Specify the address for the llamafile server to listen on"
)
@click.option(
    "--port", type=int, help="Specify the port for the llamafile server to listen on"
)
@click.option(
    "--force-gpu",
    is_flag=True,
    hidden=platform.system() != "Darwin",
    help="Run the model using only the GPU (AMD or Nvidia). "
    "Otherwise, both CPU and GPU may be (partially) used.",
)
def main(
    llamafile: Optional[Path] = None,
    llamafile_url: Optional[str] = None,
    host: Optional[str] = None,
    port: Optional[int] = None,
    force_gpu: bool = False,
):
print(f"type(llamafile) = {type(llamafile)}")
    if not llamafile:
        if not llamafile_url:
            llamafile = LLAMAFILE
        else:
            llamafile = Path(llamafile_url.rsplit("/", 1)[1])
            if llamafile.suffix != ".llamafile":
                click.echo(
                    click.style(
                        "The given URL does not end with '.llamafile', so the "
                        "filename can't be derived from it. "
                        "Specify the filename using --llamafile.",
                        fg="red",
                    ),
                    err=True,
                )
                return

    if llamafile == LLAMAFILE and not llamafile_url:
        llamafile_url = LLAMAFILE_URL
    elif llamafile_url != LLAMAFILE_URL:
        if not click.prompt(
            click.style(
                "You seem to have specified a different URL for the default model "
                f"({llamafile.name}). Are you sure this is correct? "
                "If you want to use a different model, also specify --llamafile.",
                fg="yellow",
            ),
            type=bool,
        ):
            return

    # Go to autogpt/scripts/llamafile/
    os.chdir(Path(__file__).resolve().parent)

    on_windows = platform.system() == "Windows"

    if not llamafile.is_file():
        if not llamafile_url:
            click.echo(
                click.style(
                    "Please use --llamafile_url to specify a download URL for "
                    f"'{llamafile.name}'. "
                    "This will only be necessary once, so we can download the model.",
                    fg="red",
                ),
                err=True,
            )
            return

        download_file(llamafile_url, llamafile)

        if not on_windows:
            llamafile.chmod(0o755)
            subprocess.run([f"./{llamafile}", "--version"], check=True)

    if not on_windows:
        base_command = [f"./{llamafile}"]
    else:
        # Windows does not allow executables over 4GB, so we have to download a
        # model-less llamafile.exe and run that instead.
        if not LLAMAFILE_EXE.is_file():
            download_file(LLAMAFILE_EXE_URL, LLAMAFILE_EXE)
            LLAMAFILE_EXE.chmod(0o755)
            subprocess.run([f".\\{LLAMAFILE_EXE}", "--version"], check=True)

        base_command = [f".\\{LLAMAFILE_EXE}", "-m", llamafile]

    if host:
        base_command.extend(["--host", host])
    if port:
        base_command.extend(["--port", str(port)])
    if force_gpu:
        # -ngl/--n-gpu-layers sets the number of model layers to offload to the
        # GPU; 9999 effectively means "offload all layers".
        base_command.extend(["-ngl", "9999"])

    subprocess.run(
        [
            *base_command,
            "--server",
            "--nobrowser",
            "--ctx-size",
            "0",
            "--n-predict",
            "1024",
        ],
        check=True,
    )
    # Note: --ctx-size 0 means the prompt context size will be set directly from
    # the underlying model configuration. This may cause slow response times or
    # consume a lot of memory.
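
    # Once the server is running it speaks the llama.cpp server API, including
    # an OpenAI-compatible chat completions endpoint. A minimal sketch, assuming
    # the default listen address (http://localhost:8080) and the third-party
    # `requests` package:
    #
    #   import requests
    #
    #   response = requests.post(
    #       "http://localhost:8080/v1/chat/completions",
    #       json={"messages": [{"role": "user", "content": "Hello!"}]},
    #   )
    #   print(response.json()["choices"][0]["message"]["content"])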


def download_file(url: str, to_file: Path) -> None:
    print(f"Downloading {to_file.name}...")
    import urllib.request

    urllib.request.urlretrieve(url, to_file, reporthook=report_download_progress)
    print()


def report_download_progress(chunk_number: int, chunk_size: int, total_size: int):
    # urllib.request.urlretrieve reporthook: called with the number of blocks
    # transferred so far, the block size in bytes, and the total file size
    # (-1 if the server did not report a Content-Length).
    if total_size != -1:
        downloaded_size = chunk_number * chunk_size
        percent = min(1, downloaded_size / total_size)
        bar = "#" * int(40 * percent)
        print(
            f"\rDownloading: [{bar:<40}] {percent:.0%}"
            f" - {downloaded_size/1e6:.1f}/{total_size/1e6:.1f} MB",
            end="",
        )
if __name__ == "__main__":
main()