301 lines
11 KiB
Nix
301 lines
11 KiB
Nix
{
|
|
self,
|
|
lib,
|
|
testers,
|
|
fetchzip,
|
|
fetchurl,
|
|
writers,
|
|
symlinkJoin,
|
|
jq,
|
|
prom2json,
|
|
}:
|
|
let
|
|
# NixOS module shared by every test machine below: it pulls in ./module.nix,
# enables the local-ai service with the package under test (`self`), sizes the
# worker thread count after the VM's configured core count and turns on debug
# logging.
common-config =
  { config, ... }:
  {
    imports = [ ./module.nix ];

    services.local-ai.enable = true;
    services.local-ai.package = self;
    services.local-ai.threads = config.virtualisation.cores;
    services.local-ai.logLevel = "debug";
  };
|
|
|
|
# Helper provided by the package's own `lib`; the tests below call it on their
# `model-configs` attrsets to build their model sets.
genModels = self.lib.genModels;
|
|
in
|
|
{
|
|
# Runs `local-ai --help` through testers.testVersion and expects the package
# version, prefixed with "v".
version = testers.testVersion {
  package = self;
  command = "local-ai --help";
  version = "v${self.version}";
};
|
|
|
|
# Smoke test without any model: waits for the service port, queries the
# readiness endpoint and exports the metrics endpoint as JSON.
health = testers.runNixOSTest {
  name = "${self.name}-health";

  nodes.machine = {
    imports = [ common-config ];
    virtualisation.memorySize = 2048;
  };

  testScript =
    let
      port = "8080";
      baseUrl = "http://localhost:${port}";
    in
    ''
      machine.wait_for_open_port(${port})
      machine.succeed("curl -f ${baseUrl}/readyz")

      machine.succeed("${prom2json}/bin/prom2json ${baseUrl}/metrics > metrics.json")
      machine.copy_from_vm("metrics.json")
    '';
};
|
|
|
|
}
|
|
// lib.optionalAttrs (!self.features.with_cublas) {
|
|
# Embeddings test using the "bert-embeddings" backend, see
# https://localai.io/features/embeddings/#bert-embeddings
#
# The test checks that the model is listed, posts one embeddings request and
# verifies that the response names the requested model.
bert =
  let
    model = "embedding";

    models = genModels {
      ${model} = {
        backend = "bert-embeddings";
        embeddings = true;
        # Note: q4_0 and q4_1 models can not be loaded
        parameters.model = fetchurl {
          url = "https://huggingface.co/skeskinen/ggml/resolve/main/all-MiniLM-L6-v2/ggml-model-f16.bin";
          hash = "sha256-nBlbJFOk/vYKT2vjqIo5IRNmIU32SYpP5IhcniIxT1A=";
        };
      };
    };

    requests = {
      request = {
        inherit model;
        input = "Your text string goes here";
      };
    };

    # Pieces interpolated into the test script below.
    port = "8080";
    baseUrl = "http://localhost:${port}";
    jqBin = "${jq}/bin/jq";
    requestFile = writers.writeJSON "request.json" requests.request;
  in
  testers.runNixOSTest {
    name = "${self.name}-bert";

    nodes.machine = {
      imports = [ common-config ];
      virtualisation = {
        cores = 2;
        memorySize = 4096;
      };
      services.local-ai.models = models;
    };

    # Exposed on the test derivation alongside the test itself.
    passthru = {
      inherit models requests;
    };

    testScript = ''
      machine.wait_for_open_port(${port})
      machine.succeed("curl -f ${baseUrl}/readyz")
      machine.succeed("curl -f ${baseUrl}/v1/models --output models.json")
      machine.succeed("${jqBin} --exit-status 'debug | .data[].id == \"${model}\"' models.json")

      machine.succeed("curl -f ${baseUrl}/embeddings --json @${requestFile} --output embeddings.json")
      machine.copy_from_vm("embeddings.json")
      machine.succeed("${jqBin} --exit-status 'debug | .model == \"${model}\"' embeddings.json")

      machine.succeed("${prom2json}/bin/prom2json ${baseUrl}/metrics > metrics.json")
      machine.copy_from_vm("metrics.json")
    '';
  };
|
|
|
|
}
|
|
// lib.optionalAttrs (!self.features.with_cublas && !self.features.with_clblas) {
|
|
# Text generation test using the "llama-cpp" backend, see
# https://localai.io/docs/getting-started/manual/
#
# Exercises the chat-completions, edit-completions and completions endpoints
# against a single model and checks the shape of each response.
llama =
  let
    model = "gpt-3.5-turbo";

    # End-of-turn marker; used both as the only stop word and as the
    # terminator of every rendered chat message.
    stopToken = "<|eot_id|>";

    # https://localai.io/advanced/#full-config-model-file-reference
    model-configs = {
      ${model} = {
        context_size = 16 * 1024; # 128kb is possible, but needs 16GB RAM
        backend = "llama-cpp";
        parameters = {
          # https://ai.meta.com/blog/meta-llama-3-1/
          model = fetchurl {
            url = "https://huggingface.co/lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf";
            hash = "sha256-8r4+GiOcEsnz8BqWKxH7KAf4Ay/bY7ClUC6kLd71XkQ=";
          };
          # defaults from:
          # https://deepinfra.com/meta-llama/Meta-Llama-3.1-8B-Instruct
          temperature = 0.7;
          top_p = 0.9;
          top_k = 0;
          # following parameter leads to outputs like: !!!!!!!!!!!!!!!!!!!
          #repeat_penalty = 1;
          presence_penalty = 0;
          frequency_penalty = 0;
          max_tokens = 100;
        };
        stopwords = [ stopToken ];
        template = {
          # Templates implement the following specification
          # https://github.com/meta-llama/llama3/tree/main?tab=readme-ov-file#instruction-tuned-models
          # ... and are inspired by:
          # https://github.com/mudler/LocalAI/blob/master/embedded/models/llama3-instruct.yaml
          #
          # The rules for template evaluation are defined here:
          # https://pkg.go.dev/text/template
          chat_message = ''
            <|start_header_id|>{{.RoleName}}<|end_header_id|>

            {{.Content}}${stopToken}'';

          chat = "{{.Input}}<|start_header_id|>assistant<|end_header_id|>";

          completion = "{{.Input}}";
        };
      };
    };

    models = genModels model-configs;

    requests = {
      # https://localai.io/features/text-generation/#chat-completions
      chat-completions = {
        inherit model;
        messages = [
          {
            role = "user";
            content = "1 + 2 = ?";
          }
        ];
      };
      # https://localai.io/features/text-generation/#edit-completions
      edit-completions = {
        inherit model;
        instruction = "rephrase";
        input = "Black cat jumped out of the window";
        max_tokens = 50;
      };
      # https://localai.io/features/text-generation/#completions
      completions = {
        inherit model;
        prompt = "A long time ago in a galaxy far, far away";
      };
    };

    # Pieces interpolated into the test script below.
    port = "8080";
    baseUrl = "http://localhost:${port}";
    jqBin = "${jq}/bin/jq";

    # JSON body file for the request named `kind` in `requests`.
    requestFile = kind: writers.writeJSON "request-${kind}.json" requests.${kind};

    # Expected `usage.completion_tokens` values: the edit request carries its
    # own max_tokens, the plain completion is compared against the model-wide
    # max_tokens.
    editMaxTokens = toString requests.edit-completions.max_tokens;
    modelMaxTokens = toString model-configs.${model}.parameters.max_tokens;
  in
  testers.runNixOSTest {
    name = "${self.name}-llama";

    nodes.machine = {
      imports = [ common-config ];
      virtualisation = {
        cores = 4;
        memorySize = 8192;
      };
      services.local-ai = {
        inherit models;
        # TODO: Add test case parallel requests
        parallelRequests = 2;
      };
    };

    # Exposed on the test derivation alongside the test itself.
    passthru = {
      inherit models requests;
    };

    testScript = ''
      machine.wait_for_open_port(${port})
      machine.succeed("curl -f ${baseUrl}/readyz")
      machine.succeed("curl -f ${baseUrl}/v1/models --output models.json")
      machine.succeed("${jqBin} --exit-status 'debug | .data[].id == \"${model}\"' models.json")

      machine.succeed("curl -f ${baseUrl}/v1/chat/completions --json @${requestFile "chat-completions"} --output chat-completions.json")
      machine.copy_from_vm("chat-completions.json")
      machine.succeed("${jqBin} --exit-status 'debug | .object == \"chat.completion\"' chat-completions.json")
      machine.succeed("${jqBin} --exit-status 'debug | .choices | first.message.content | split(\" \") | last | tonumber == 3' chat-completions.json")

      machine.succeed("curl -f ${baseUrl}/v1/edits --json @${requestFile "edit-completions"} --output edit-completions.json")
      machine.copy_from_vm("edit-completions.json")
      machine.succeed("${jqBin} --exit-status 'debug | .object == \"edit\"' edit-completions.json")
      machine.succeed("${jqBin} --exit-status '.usage.completion_tokens | debug == ${editMaxTokens}' edit-completions.json")

      machine.succeed("curl -f ${baseUrl}/v1/completions --json @${requestFile "completions"} --output completions.json")
      machine.copy_from_vm("completions.json")
      machine.succeed("${jqBin} --exit-status 'debug | .object ==\"text_completion\"' completions.json")
      machine.succeed("${jqBin} --exit-status '.usage.completion_tokens | debug == ${modelMaxTokens}' completions.json")

      machine.succeed("${prom2json}/bin/prom2json ${baseUrl}/metrics > metrics.json")
      machine.copy_from_vm("metrics.json")
    '';
  };
|
|
|
|
}
|
|
//
|
|
lib.optionalAttrs
|
|
(self.features.with_tts && !self.features.with_cublas && !self.features.with_clblas)
|
|
{
|
|
# Text-to-speech round trip, see
# https://localai.io/features/text-to-audio/#piper
#
# The input sentence is synthesised through the "piper" backend, the resulting
# out.wav is sent to the "whisper" backend for transcription, and the first
# transcribed segment is compared with the original input.
tts =
  let
    model-stt = "whisper-en";
    model-tts = "piper-en";

    model-configs = {
      ${model-stt} = {
        backend = "whisper";
        parameters.model = fetchurl {
          url = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q5_1.bin";
          hash = "sha256-x3xXZvHO8JtrfUfyG1Rsvd1BV4hrO11tT3CekeZsfCs=";
        };
      };

      ${model-tts} = {
        backend = "piper";
        # Referenced by file name only; presumably shipped in the voice
        # archive that is joined into `models` below.
        parameters.model = "en-us-danny-low.onnx";
      };
    };

    # Generated model set merged with the unpacked piper voice archive, kept
    # under the generated set's name.
    models =
      let
        generated = genModels model-configs;
        voice = fetchzip {
          url = "https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-danny-low.tar.gz";
          hash = "sha256-5wf+6H5HeQY0qgdqnAG1vSqtjIFM9lXH53OgouuPm0M=";
          stripRoot = false;
        };
      in
      symlinkJoin {
        name = generated.name;
        paths = [
          generated
          voice
        ];
      };

    requests = {
      request = {
        model = model-tts;
        input = "Hello, how are you?";
      };
    };

    # Pieces interpolated into the test script below.
    port = "8080";
    baseUrl = "http://localhost:${port}";
    jqBin = "${jq}/bin/jq";
    requestFile = writers.writeJSON "request.json" requests.request;
  in
  testers.runNixOSTest {
    name = "${self.name}-tts";

    nodes.machine = {
      imports = [ common-config ];
      virtualisation.cores = 2;
      services.local-ai.models = models;
    };

    # Exposed on the test derivation alongside the test itself.
    passthru = {
      inherit models requests;
    };

    testScript = ''
      machine.wait_for_open_port(${port})
      machine.succeed("curl -f ${baseUrl}/readyz")
      machine.succeed("curl -f ${baseUrl}/v1/models --output models.json")
      machine.succeed("${jqBin} --exit-status 'debug' models.json")

      machine.succeed("curl -f ${baseUrl}/tts --json @${requestFile} --output out.wav")
      machine.copy_from_vm("out.wav")

      machine.succeed("curl -f ${baseUrl}/v1/audio/transcriptions --header 'Content-Type: multipart/form-data' --form file=@out.wav --form model=${model-stt} --output transcription.json")
      machine.copy_from_vm("transcription.json")
      machine.succeed("${jqBin} --exit-status 'debug | .segments | first.text == \"${requests.request.input}\"' transcription.json")

      machine.succeed("${prom2json}/bin/prom2json ${baseUrl}/metrics > metrics.json")
      machine.copy_from_vm("metrics.json")
    '';
  };
|
|
}
|