Documentation
¶
Index ¶
- Constants
- Variables
- func DetectGGMLType(b []byte) string
- func ParseFileType(s string) (fileType, error)
- func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, ...) (bool, uint64)
- func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error
- type CompletionRequest
- type CompletionResponse
- type DetokenizeRequest
- type DetokenizeResponse
- type EmbeddingRequest
- type EmbeddingResponse
- type GGML
- type ImageData
- type KV
- func (kv KV) Architecture() string
- func (kv KV) BlockCount() uint64
- func (kv KV) ChatTemplate() string
- func (kv KV) ContextLength() uint64
- func (kv KV) EmbeddingHeadCount() uint64
- func (kv KV) EmbeddingHeadCountK() uint64
- func (kv KV) EmbeddingHeadCountV() uint64
- func (kv KV) EmbeddingLength() uint64
- func (kv KV) FileType() fileType
- func (kv KV) GQA() uint64
- func (kv KV) HeadCount() uint64
- func (kv KV) HeadCountKV() uint64
- func (kv KV) Kind() string
- func (kv KV) ParameterCount() uint64
- type Layer
- type LlamaServer
- type MemoryEstimate
- type ServerStatus
- type ServerStatusResp
- type StatusWriter
- type Tensor
- type Tensors
- type TokenizeRequest
- type TokenizeResponse
Constants ¶
const ( // Magic constant for `ggml` files (unversioned). FILE_MAGIC_GGML = 0x67676d6c // Magic constant for `ggml` files (versioned, ggmf). FILE_MAGIC_GGMF = 0x67676d66 // Magic constant for `ggml` files (versioned, ggjt). FILE_MAGIC_GGJT = 0x67676a74 // Magic constant for `ggla` files (LoRA adapter). FILE_MAGIC_GGLA = 0x67676C61 // Magic constant for `gguf` files (versioned, gguf) FILE_MAGIC_GGUF_LE = 0x46554747 FILE_MAGIC_GGUF_BE = 0x47475546 )
Variables ¶
var ErrUnsupportedFormat = errors.New("unsupported model format")
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
Functions ¶
func DetectGGMLType ¶
func ParseFileType ¶
func PredictServerFit ¶
func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64)
This algorithm looks for a complete fit to determine if we need to unload other models.
Types ¶
type CompletionRequest ¶
type CompletionResponse ¶
type DetokenizeRequest ¶
type DetokenizeRequest struct {
Tokens []int `json:"tokens"`
}
type DetokenizeResponse ¶
type DetokenizeResponse struct {
Content string `json:"content"`
}
type EmbeddingRequest ¶
type EmbeddingRequest struct {
Content string `json:"content"`
}
type EmbeddingResponse ¶
type EmbeddingResponse struct {
Embedding []float32 `json:"embedding"`
}
type GGML ¶
type GGML struct {
// contains filtered or unexported fields
}
func DecodeGGML ¶
DecodeGGML decodes a GGML model from the given reader.
It collects array values for arrays with a size less than or equal to maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If the maxArraySize is negative, all arrays are collected.
func LoadModel ¶
LoadModel will load a model from disk. The model must be in the GGML format.
It collects array values for arrays with a size less than or equal to maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If the maxArraySize is negative, all arrays are collected.
func (GGML) SupportsFlashAttention ¶
SupportsFlashAttention checks if the model supports flash attention.
func (GGML) SupportsKVCacheType ¶
SupportsKVCacheType checks if the requested cache type is supported.
type KV ¶
func (KV) Architecture ¶
func (KV) BlockCount ¶
func (KV) ChatTemplate ¶
func (KV) ContextLength ¶
func (KV) EmbeddingHeadCount ¶
func (KV) EmbeddingHeadCountK ¶
func (KV) EmbeddingHeadCountV ¶
func (KV) EmbeddingLength ¶
func (KV) HeadCountKV ¶
func (KV) ParameterCount ¶
type LlamaServer ¶
type LlamaServer interface {
Ping(ctx context.Context) error
WaitUntilRunning(ctx context.Context) error
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
Embedding(ctx context.Context, input string) ([]float32, error)
Tokenize(ctx context.Context, content string) ([]int, error)
Detokenize(ctx context.Context, tokens []int) (string, error)
Close() error
EstimatedVRAM() uint64 // Total VRAM across all GPUs
EstimatedTotal() uint64
EstimatedVRAMByGPU(gpuID string) uint64
}
func NewLlamaServer ¶
func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error)
NewLlamaServer will run a server for the given GPUs. The GPU list must be a single family.
type MemoryEstimate ¶
type MemoryEstimate struct {
// How many layers we predict we can load
Layers int
// The size of the graph which occupies the main GPU
Graph uint64
// How much VRAM will be allocated given the number of layers we predict
VRAMSize uint64
// The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
TotalSize uint64
// For multi-GPU scenarios, this provides the tensor split parameter
TensorSplit string
// For multi-GPU scenarios, this is the size in bytes per GPU
GPUSizes []uint64
// contains filtered or unexported fields
}
func EstimateGPULayers ¶
func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate
Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size. The GPUs provided must all be the same Library.
type ServerStatus ¶
type ServerStatus int
const ( ServerStatusReady ServerStatus = iota ServerStatusNoSlotsAvailable ServerStatusLoadingModel ServerStatusNotResponding ServerStatusError )
func (ServerStatus) ToString ¶
func (s ServerStatus) ToString() string
type ServerStatusResp ¶
type StatusWriter ¶
type StatusWriter struct {
LastErrMsg string
// contains filtered or unexported fields
}
StatusWriter is a writer that captures error messages from the llama runner process.
func NewStatusWriter ¶
func NewStatusWriter(out *os.File) *StatusWriter
type Tensor ¶
type TokenizeRequest ¶
type TokenizeRequest struct {
Content string `json:"content"`
}
type TokenizeResponse ¶
type TokenizeResponse struct {
Tokens []int `json:"tokens"`
}