Documentation
¶
Index ¶
- Constants
- Variables
- func DetectGGMLType(b []byte) string
- func ParseFileType(s string) (fileType, error)
- func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, ...) (bool, uint64)
- func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error
- type CompletionRequest
- type CompletionResponse
- type DetokenizeRequest
- type DetokenizeResponse
- type EmbeddingRequest
- type EmbeddingResponse
- type GGML
- type ImageData
- type KV
- func (kv KV) Architecture() string
- func (kv KV) BlockCount() uint64
- func (kv KV) ChatTemplate() string
- func (kv KV) ContextLength() uint64
- func (kv KV) EmbeddingHeadCount() uint64
- func (kv KV) EmbeddingHeadCountK() uint64
- func (kv KV) EmbeddingHeadCountV() uint64
- func (kv KV) EmbeddingLength() uint64
- func (kv KV) FileType() fileType
- func (kv KV) GQA() uint64
- func (kv KV) HeadCount() uint64
- func (kv KV) HeadCountKV() uint64
- func (kv KV) Kind() string
- func (kv KV) ParameterCount() uint64
- type Layer
- type LlamaServer
- type MemoryEstimate
- type ServerStatus
- type ServerStatusResp
- type StatusWriter
- type Tensor
- type Tensors
- type TokenizeRequest
- type TokenizeResponse
Constants ¶
const ( // Magic constant for `ggml` files (unversioned). FILE_MAGIC_GGML = 0x67676d6c // Magic constant for `ggml` files (versioned, ggmf). FILE_MAGIC_GGMF = 0x67676d66 // Magic constant for `ggml` files (versioned, ggjt). FILE_MAGIC_GGJT = 0x67676a74 // Magic constant for `ggla` files (LoRA adapter). FILE_MAGIC_GGLA = 0x67676C61 // Magic constant for `gguf` files (versioned, gguf) FILE_MAGIC_GGUF_LE = 0x46554747 FILE_MAGIC_GGUF_BE = 0x47475546 )
Variables ¶
var ErrUnsupportedFormat = errors.New("unsupported model format")
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
Functions ¶
func DetectGGMLType ¶
func ParseFileType ¶
func PredictServerFit ¶
func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64)
This algorithm looks for a complete fit to determine if we need to unload other models.
Types ¶
type CompletionRequest ¶
type CompletionResponse ¶
type DetokenizeRequest ¶
type DetokenizeRequest struct {
Tokens []int `json:"tokens"`
}
type DetokenizeResponse ¶
type DetokenizeResponse struct {
Content string `json:"content"`
}
type EmbeddingRequest ¶
type EmbeddingRequest struct {
Content string `json:"content"`
}
type EmbeddingResponse ¶
type EmbeddingResponse struct {
Embedding []float32 `json:"embedding"`
}
type GGML ¶
type GGML struct {
// contains filtered or unexported fields
}
func DecodeGGML ¶
DecodeGGML decodes a GGML model from the given reader.
It collects array values for arrays with a size less than or equal to maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If the maxArraySize is negative, all arrays are collected.
func LoadModel ¶
LoadModel will load a model from disk. The model must be in the GGML format.
It collects array values for arrays with a size less than or equal to maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If the maxArraySize is negative, all arrays are collected.
func (GGML) SupportsFlashAttention ¶
SupportsFlashAttention checks if the model supports flash attention.
func (GGML) SupportsKVCacheType ¶
SupportsKVCacheType checks if the requested cache type is supported.
type KV ¶
func (KV) Architecture ¶
func (KV) BlockCount ¶
func (KV) ChatTemplate ¶
func (KV) ContextLength ¶
func (KV) EmbeddingHeadCount ¶
func (KV) EmbeddingHeadCountK ¶
func (KV) EmbeddingHeadCountV ¶
func (KV) EmbeddingLength ¶
func (KV) HeadCountKV ¶
func (KV) ParameterCount ¶
type LlamaServer ¶
type LlamaServer interface {
Ping(ctx context.Context) error
WaitUntilRunning(ctx context.Context) error
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
Embedding(ctx context.Context, input string) ([]float32, error)
Tokenize(ctx context.Context, content string) ([]int, error)
Detokenize(ctx context.Context, tokens []int) (string, error)
Close() error
EstimatedVRAM() uint64 // Total VRAM across all GPUs
EstimatedTotal() uint64
EstimatedVRAMByGPU(gpuID string) uint64
}
func NewLlamaServer ¶
func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error)
NewLlamaServer will run a server for the given GPUs. The GPU list must be a single family.
type MemoryEstimate ¶
type MemoryEstimate struct {
// How many layers we predict we can load
Layers int
// The size of the graph which occupies the main GPU
Graph uint64
// How much VRAM will be allocated given the number of layers we predict
VRAMSize uint64
// The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
TotalSize uint64
// For multi-GPU scenarios, this provides the tensor split parameter
TensorSplit string
// For multi-GPU scenarios, this is the size in bytes per GPU
GPUSizes []uint64
// contains filtered or unexported fields
}
func EstimateGPULayers ¶
func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate
Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size. The GPUs provided must all be the same Library.
type ServerStatus ¶
type ServerStatus int
const ( ServerStatusReady ServerStatus = iota ServerStatusNoSlotsAvailable ServerStatusLoadingModel ServerStatusNotResponding ServerStatusError )
func (ServerStatus) ToString ¶
func (s ServerStatus) ToString() string
type ServerStatusResp ¶
type StatusWriter ¶
type StatusWriter struct {
LastErrMsg string
// contains filtered or unexported fields
}
StatusWriter is a writer that captures error messages from the llama runner process.
func NewStatusWriter ¶
func NewStatusWriter(out *os.File) *StatusWriter
type Tensor ¶
type TokenizeRequest ¶
type TokenizeRequest struct {
Content string `json:"content"`
}
type TokenizeResponse ¶
type TokenizeResponse struct {
Tokens []int `json:"tokens"`
}