Documentation
¶
Overview ¶
Example ¶
package main
import (
"context"
"fmt"
"os"
"github.com/coseyo/gptbot"
)
func main() {
ctx := context.Background()
apiKey := os.Getenv("OPENAI_API_KEY")
encoder := gptbot.NewOpenAIEncoder(apiKey, "")
store := gptbot.NewLocalVectorStore()
// Feed documents into the vector store.
feeder := gptbot.NewFeeder(&gptbot.FeederConfig{
Encoder: encoder,
Updater: store,
})
err := feeder.Feed(ctx, &gptbot.Document{
ID: "1",
Text: "Generative Pre-trained Transformer 3 (GPT-3) is an autoregressive language model released in 2020 that uses deep learning to produce human-like text. Given an initial text as prompt, it will produce text that continues the prompt.\n\nThe architecture is a decoder-only transformer network with a 2048-token-long context and then-unprecedented size of 175 billion parameters, requiring 800GB to store. The model was trained using generative pre-training; it is trained to predict what the next token is based on previous tokens. The model demonstrated strong zero-shot and few-shot learning on many tasks.[2]",
})
if err != nil {
fmt.Printf("err: %v", err)
return
}
// Chat with the bot to get answers.
bot := gptbot.NewBot(&gptbot.BotConfig{
APIKey: apiKey,
Encoder: encoder,
Querier: store,
})
question := "When was GPT-3 released?"
answer, err := bot.Chat(ctx, question)
if err != nil {
fmt.Printf("err: %v", err)
return
}
fmt.Printf("Q: %s\n", question)
fmt.Printf("A: %s\n", answer)
question = "How many parameters does GPT-3 use?"
answer, err = bot.Chat(ctx, question)
if err != nil {
fmt.Printf("err: %v", err)
return
}
fmt.Printf("Q: %s\n", question)
fmt.Printf("A: %s\n", answer)
}
Output: Q: When was GPT-3 released? A: GPT-3 was released in 2020. Q: How many parameters does GPT-3 use? A: GPT-3 uses 175 billion parameters.
Example (MultiTurn) ¶
package main
import (
"context"
"fmt"
"os"
"github.com/coseyo/gptbot"
)
func main() {
ctx := context.Background()
apiKey := os.Getenv("OPENAI_API_KEY")
encoder := gptbot.NewOpenAIEncoder(apiKey, "")
store := gptbot.NewLocalVectorStore()
// Feed documents into the vector store.
feeder := gptbot.NewFeeder(&gptbot.FeederConfig{
Encoder: encoder,
Updater: store,
})
err := feeder.Feed(ctx, &gptbot.Document{
ID: "1",
Text: "Generative Pre-trained Transformer 3 (GPT-3) is an autoregressive language model released in 2020 that uses deep learning to produce human-like text. Given an initial text as prompt, it will produce text that continues the prompt.\n\nThe architecture is a decoder-only transformer network with a 2048-token-long context and then-unprecedented size of 175 billion parameters, requiring 800GB to store. The model was trained using generative pre-training; it is trained to predict what the next token is based on previous tokens. The model demonstrated strong zero-shot and few-shot learning on many tasks.[2]",
})
if err != nil {
fmt.Printf("err: %v", err)
return
}
// Chat with the bot to get answers.
bot := gptbot.NewBot(&gptbot.BotConfig{
APIKey: apiKey,
Encoder: encoder,
Querier: store,
})
var history []*gptbot.Turn
question := "When was GPT-3 released?"
answer, err := bot.Chat(ctx, question, history...)
if err != nil {
fmt.Printf("err: %v", err)
return
}
fmt.Printf("Q: %s\n", question)
fmt.Printf("A: %s\n", answer)
// Save the conversation history.
history = append(history, &gptbot.Turn{
Question: question,
Answer: answer,
})
question = "How many parameters does it use?" // In multi-turn mode, here "it" will be inferred to "GPT-3".
answer, err = bot.Chat(ctx, question, history...)
if err != nil {
fmt.Printf("err: %v", err)
return
}
fmt.Printf("Q: %s\n", question)
fmt.Printf("A: %s\n", answer)
}
Output: Q: When was GPT-3 released? A: GPT-3 was released in 2020. Q: How many parameters does it use? A: GPT-3 uses 175 billion parameters.
Index ¶
- Constants
- type Bot
- type BotConfig
- type Chunk
- type Document
- type Embedding
- type Encoder
- type Feeder
- type FeederConfig
- type LocalVectorStore
- func (vs *LocalVectorStore) Delete(ctx context.Context, documentIDs ...string) error
- func (vs *LocalVectorStore) GetAllData(ctx context.Context) map[string][]*Chunk
- func (vs *LocalVectorStore) Insert(ctx context.Context, chunks map[string][]*Chunk) error
- func (vs *LocalVectorStore) LoadJSON(ctx context.Context, filename string) error
- func (vs *LocalVectorStore) Query(ctx context.Context, embedding Embedding, topK int) ([]*Similarity, error)
- func (vs *LocalVectorStore) StoreJSON(filename string) error
- type Metadata
- type ModelType
- type OpenAIEncoder
- type Preprocessor
- type PreprocessorConfig
- type PromptData
- type PromptTemplate
- type Querier
- type Similarity
- type Turn
- type Updater
- type XPreprocessor
Examples ¶
Constants ¶
View Source
const ( DefaultPromptTmpl = `` /* 225-byte string literal not displayed */ DefaultMultiTurnPromptTmpl = `` /* 786-byte string literal not displayed */ )
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type BotConfig ¶
// BotConfig is the configuration used by NewBot to construct a question
// answering Bot. APIKey, Encoder and Querier are required; all other
// fields have sensible defaults.
type BotConfig struct {
// APIKey is the OpenAI's APIKey.
// This field is required.
APIKey string
// Encoder is an Embedding Encoder, which will encode the user's question into a vector for similarity search.
// This field is required.
Encoder Encoder
// Querier is a search engine, which is capable of doing the similarity search.
// This field is required.
Querier Querier
// Model is the ID of OpenAI's model to use for chat.
// Defaults to "gpt-3.5-turbo".
Model ModelType
// TopK specifies how many candidate similarities will be selected to construct the prompt.
// Defaults to 3.
TopK int
// MaxTokens is the maximum number of tokens to generate in the chat.
// Defaults to 256.
MaxTokens int
// PromptTmpl specifies a custom prompt template for single-turn mode.
// Defaults to DefaultPromptTmpl.
PromptTmpl string
// MultiTurnPromptTmpl specifies a custom prompt template for multi-turn mode.
// Defaults to DefaultMultiTurnPromptTmpl.
//
// Prompt-based question answering bot essentially operates in single-turn mode,
// since the quality of each answer largely depends on the associated prompt context
// (i.e. the most similar document chunks), which all depends on the corresponding
// question rather than the conversation history.
//
// As a workaround, we try to achieve the effect of multi-turn mode by adding an
// extra frontend agent, who can respond directly to the user for casual greetings,
// and can refine incomplete questions according to the conversation history
// before consulting the backend system (i.e. the single-turn Question Answering Bot).
MultiTurnPromptTmpl string
}
type Feeder ¶
// Feeder ingests documents into a vector store: per FeederConfig, it
// encodes document chunks with the configured Encoder and upserts them
// through the configured Updater (see Feeder.Feed in the examples).
type Feeder struct {
// contains filtered or unexported fields
}
func NewFeeder ¶
func NewFeeder(cfg *FeederConfig) *Feeder
type FeederConfig ¶
// FeederConfig is the configuration used by NewFeeder to construct a Feeder.
// Encoder and Updater are required; the other fields have defaults.
type FeederConfig struct {
// Encoder is the embedding encoder.
// This field is required.
Encoder Encoder
// Updater is the vector store for inserting/deleting chunks.
// This field is required.
Updater Updater
// Preprocessor splits the fed documents into chunks before encoding.
// Defaults to NewPreprocessor(...).
Preprocessor XPreprocessor
// BatchSize is the number of chunks to encode/upsert at a time.
// Defaults to 100.
BatchSize int
}
type LocalVectorStore ¶
// LocalVectorStore is a vector store that supports inserting and deleting
// chunks, similarity queries (Query), and persistence to/from disk as JSON
// (StoreJSON/LoadJSON). See NewLocalVectorStore.
type LocalVectorStore struct {
// contains filtered or unexported fields
}
func NewLocalVectorStore ¶
func NewLocalVectorStore() *LocalVectorStore
func (*LocalVectorStore) Delete ¶
func (vs *LocalVectorStore) Delete(ctx context.Context, documentIDs ...string) error
Delete deletes the chunks belonging to the given documentIDs. As a special case, empty documentIDs means deleting all chunks.
func (*LocalVectorStore) GetAllData ¶
func (vs *LocalVectorStore) GetAllData(ctx context.Context) map[string][]*Chunk
GetAllData returns all the internal data. It is mainly used for testing purposes.
func (*LocalVectorStore) LoadJSON ¶
func (vs *LocalVectorStore) LoadJSON(ctx context.Context, filename string) error
LoadJSON will deserialize from disk into a `LocalVectorStore` based on the provided filename.
func (*LocalVectorStore) Query ¶
func (vs *LocalVectorStore) Query(ctx context.Context, embedding Embedding, topK int) ([]*Similarity, error)
func (*LocalVectorStore) StoreJSON ¶
func (vs *LocalVectorStore) StoreJSON(filename string) error
StoreJSON will serialize the `LocalVectorStore` to disk based on the provided filename.
type ModelType ¶
// ModelType identifies which OpenAI model is used for chat
// (see BotConfig.Model).
type ModelType string

// The supported OpenAI model IDs.
const (
	// GPT-4
	GPT4 ModelType = "gpt-4"

	// GPT-3.5
	GPT3Dot5Turbo  ModelType = "gpt-3.5-turbo"
	TextDavinci003 ModelType = "text-davinci-003"
	TextDavinci002 ModelType = "text-davinci-002"

	// GPT-3
	TextAda001     ModelType = "text-ada-001"
	TextCurie001   ModelType = "text-curie-001"
	TextBabbage001 ModelType = "text-babbage-001"
)
type OpenAIEncoder ¶
// OpenAIEncoder is an Encoder that produces embeddings via OpenAI's API.
// Construct it with NewOpenAIEncoder, which takes an API key and a model
// name (the examples pass "" for the model, presumably selecting a default
// embedding model — confirm against NewOpenAIEncoder's implementation).
type OpenAIEncoder struct {
// contains filtered or unexported fields
}
func NewOpenAIEncoder ¶
func NewOpenAIEncoder(apiKey string, model string) *OpenAIEncoder
func (*OpenAIEncoder) EncodeBatch ¶
type Preprocessor ¶
// Preprocessor splits a list of documents into chunks.
// See NewPreprocessor and PreprocessorConfig for the chunking parameters.
type Preprocessor struct {
// contains filtered or unexported fields
}
Preprocessor splits a list of documents into chunks.
func NewPreprocessor ¶
func NewPreprocessor(cfg *PreprocessorConfig) *Preprocessor
func (*Preprocessor) Preprocess ¶
func (p *Preprocessor) Preprocess(docs ...*Document) (map[string][]*Chunk, error)
type PreprocessorConfig ¶
// PreprocessorConfig is the configuration used by NewPreprocessor to
// construct a Preprocessor. All fields are optional and default as noted.
type PreprocessorConfig struct {
// ChunkTokenNum is the number of tokens for each text chunk.
// Defaults to 200.
ChunkTokenNum int
// MinChunkCharNum is the minimum number of characters for each text chunk.
// Defaults to 350.
MinChunkCharNum int
// MinChunkLenToEmbed is the minimum length in characters.
// Chunks with shorter length will be discarded.
//
// Defaults to 5.
MinChunkLenToEmbed int
// MaxChunkNum is the maximum number of chunks to generate from a text.
// Defaults to 10000.
MaxChunkNum int
}
type PromptData ¶
type PromptTemplate ¶
type PromptTemplate string
type Similarity ¶
type Turn ¶
// Turn is one round of dialogue: a user question paired with the bot's
// answer. Both fields are omitted from the JSON encoding when empty.
type Turn struct {
Question string `json:"question,omitempty"`
Answer string `json:"answer,omitempty"`
}
Turn represents a round of dialogue.
Source Files
¶
Click to show internal directories.
Click to hide internal directories.
