# omnia-langchain/llm_runtime.py — shared embeddings + local llama.cpp chat model runtime
import logging
import multiprocessing
from langchain_community.chat_models import ChatLlamaCpp
from langchain_community.embeddings import HuggingFaceEmbeddings
# Default on-disk GGUF checkpoint served by llama.cpp.
# NOTE(review): hard-coded absolute path under a single user's HF cache —
# presumably meant to be overridden via config/env; confirm with callers.
DEFAULT_MODEL_PATH = "/home/sortedcord/.cache/huggingface/hub/models--ggml-org--gemma-4-E4B-it-GGUF/snapshots/6b352c53e1d2e4bb974d9f8cafcf85887c224219/gemma-4-e4b-it-Q4_K_M.gguf"

logger = logging.getLogger(__name__)

# Module-level singletons: both are constructed at import time, which loads
# model weights as a side effect — importing this module is expensive.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

llm = ChatLlamaCpp(
    temperature=0.2,          # low temperature → mostly deterministic output
    model_path=DEFAULT_MODEL_PATH,
    n_ctx=4096,               # context window in tokens
    n_gpu_layers=11,          # layers offloaded to GPU; rest run on CPU
    max_tokens=512,           # cap on generated tokens per call
    n_threads=multiprocessing.cpu_count() - 1,  # leave one core free
    repeat_penalty=1.5,
    verbose=False,
)
def _format_prompt(messages):
formatted = []
for message in messages:
formatted.append(f"{message.__class__.__name__}:\n{message.content}")
return "\n\n".join(formatted)
def _normalize_llm_output(text: str) -> str:
return text.replace("\r", "").replace("\n", "").strip()