devices

package
v0.1.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 6, 2025 License: Apache-2.0 Imports: 12 Imported by: 0

Documentation

Overview

Copyright 2024-2025.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Index

Constants

View Source
// NVMLWarpSize is the constant warp size, 32.
// NOTE(review): presumably the value reported as TritonGPUInfo.WarpSize for
// NVML-backed devices — confirm against the NVML device implementation.
const (
	NVMLWarpSize = 32
)

Variables

This section is empty.

Functions

func RegisterMockDevice

func RegisterMockDevice()

func SetRegistry

func SetRegistry(registry *Registry)

SetRegistry replaces the global registry instance. NOTE: All plugins will need to be manually registered again after this function is called.

func TranslateGPUToArch

func TranslateGPUToArch(productName string) string

TranslateGPUToArch translates a GPU product name to its GFX architecture.

Types

type AMDASIC

// AMDASIC holds the ASIC section of an AMD card description. Every field is
// decoded from the JSON key named in its struct tag.
//
// OAMID is deliberately untyped (any), so the decoded value keeps whatever
// dynamic type the JSON decoder produces for "oam_id".
// NOTE(review): presumably the producer emits either a number or a
// placeholder string for this key — confirm before narrowing the type.
type AMDASIC struct {
	MarketName            string `json:"market_name"`
	VendorID              string `json:"vendor_id"`
	VendorName            string `json:"vendor_name"`
	SubvendorID           string `json:"subvendor_id"`
	DeviceID              string `json:"device_id"`
	SubsystemID           string `json:"subsystem_id"`
	RevID                 string `json:"rev_id"`
	ASICSerial            string `json:"asic_serial"`
	OAMID                 any    `json:"oam_id"`
	NumComputeUnits       int    `json:"num_compute_units"`
	TargetGraphicsVersion string `json:"target_graphics_version"`
}

type AMDBoard

// AMDBoard holds the board section of an AMD card description: model number,
// product serial, FRU ID, product name and manufacturer name. All fields are
// strings decoded from the JSON keys named in their struct tags.
type AMDBoard struct {
	ModelNumber      string `json:"model_number"`
	ProductSerial    string `json:"product_serial"`
	FRUID            string `json:"fru_id"`
	ProductName      string `json:"product_name"`
	ManufacturerName string `json:"manufacturer_name"`
}

type AMDBus

// AMDBus holds the bus section of an AMD card description: the BDF string,
// the maximum PCIe width, the PCIe interface version and the slot type.
// NOTE(review): BDF is presumably the PCI bus:device.function address —
// confirm the exact format against the JSON producer.
type AMDBus struct {
	BDF                  string `json:"bdf"`
	MaxPCIeWidth         int    `json:"max_pcie_width"`
	PCIeInterfaceVersion string `json:"pcie_interface_version"`
	SlotType             string `json:"slot_type"`
}

type AMDCacheInfo

// AMDCacheInfo describes one cache entry of a card; AMDCardInfo.CacheInfo is a
// slice of these. The cache size is carried as an AMDUnitValue (value plus
// unit); the remaining fields are decoded from the JSON keys in their tags.
type AMDCacheInfo struct {
	Cache            int          `json:"cache"`
	CacheProperties  []string     `json:"cache_properties"`
	CacheSize        AMDUnitValue `json:"cache_size"`
	CacheLevel       int          `json:"cache_level"`
	MaxNumCUShared   int          `json:"max_num_cu_shared"`
	NumCacheInstance int          `json:"num_cache_instance"`
}

type AMDCardInfo

// AMDCardInfo is the full description of one AMD GPU. It combines the GPU
// number with the ASIC, bus, VBIOS, driver, board, RAS, partition, XGMI PLPD,
// NUMA, VRAM and cache sections, plus the SOC P-state and process-isolation
// strings. Each field is decoded from the JSON key named in its struct tag.
type AMDCardInfo struct {
	GPU              int            `json:"gpu"`
	ASIC             AMDASIC        `json:"asic"`
	Bus              AMDBus         `json:"bus"`
	VBIOS            AMDVBIOS       `json:"vbios"`
	Driver           AMDDriver      `json:"driver"`
	Board            AMDBoard       `json:"board"`
	RAS              AMDRAS         `json:"ras"`
	Partition        AMDPartition   `json:"partition"`
	SOCPState        string         `json:"soc_pstate"`
	XGMIPlpd         AMDXGMIPlpd    `json:"xgmi_plpd"`
	ProcessIsolation string         `json:"process_isolation"`
	NUMA             AMDNUMA        `json:"numa"`
	VRAM             AMDVRAM        `json:"vram"`
	CacheInfo        []AMDCacheInfo `json:"cache_info"`
}

type AMDDriver

// AMDDriver holds the driver section of an AMD card description: the driver
// name and its version string.
type AMDDriver struct {
	Name    string `json:"name"`
	Version string `json:"version"`
}

type AMDGPUInfo

// AMDGPUInfo groups two int-keyed maps: full card descriptions (GPUInfo) and
// list entries (ListInfo). The fields carry no JSON tags.
// NOTE(review): the map keys are presumably GPU indices — confirm against the
// code that populates the maps.
type AMDGPUInfo struct {
	GPUInfo  map[int]*AMDCardInfo
	ListInfo map[int]*AMDListInfo
}

type AMDListInfo

// AMDListInfo is one entry of a GPU listing: GPU number, BDF string, unique
// ID, KFD ID, node ID and partition ID. Note that UniqueID is decoded from the
// JSON key "uuid".
type AMDListInfo struct {
	GPU         int    `json:"gpu"`
	BDF         string `json:"bdf"`
	UniqueID    string `json:"uuid"`
	KFDID       int    `json:"kfd_id"`
	NodeID      int    `json:"node_id"`
	PartitionID int    `json:"partition_id"`
}

type AMDNUMA

// AMDNUMA holds the NUMA section of an AMD card description: the node and
// affinity values, both integers.
type AMDNUMA struct {
	Node     int `json:"node"`
	Affinity int `json:"affinity"`
}

type AMDPLPD

// AMDPLPD is a single PLPD policy entry (ID plus description);
// AMDXGMIPlpd.PLPDs is a slice of these.
type AMDPLPD struct {
	PolicyID          int    `json:"policy_id"`
	PolicyDescription string `json:"policy_description"`
}

type AMDPartition

// AMDPartition holds the partition section of an AMD card description: the
// compute and memory partition strings and the integer partition ID.
type AMDPartition struct {
	ComputePartition string `json:"compute_partition"`
	MemoryPartition  string `json:"memory_partition"`
	PartitionID      int    `json:"partition_id"`
}

type AMDRAS

// AMDRAS holds the RAS section of an AMD card description: the EEPROM version,
// the parity, single-bit, double-bit and poison schema strings, and a
// string-to-string map of ECC block states.
// NOTE(review): the ECCBlockState keys are presumably block names — confirm
// against the JSON producer.
type AMDRAS struct {
	EEPROMVersion   string            `json:"eeprom_version"`
	ParitySchema    string            `json:"parity_schema"`
	SingleBitSchema string            `json:"single_bit_schema"`
	DoubleBitSchema string            `json:"double_bit_schema"`
	PoisonSchema    string            `json:"poison_schema"`
	ECCBlockState   map[string]string `json:"ecc_block_state"`
}

type AMDUnitValue

// AMDUnitValue is a floating-point value paired with its unit string. It is
// the type of AMDCacheInfo.CacheSize.
type AMDUnitValue struct {
	Value float64 `json:"value"`
	Unit  string  `json:"unit"`
}

type AMDVBIOS

// AMDVBIOS holds the VBIOS section of an AMD card description: name, build
// date, part number and version, all as strings.
type AMDVBIOS struct {
	Name       string `json:"name"`
	BuildDate  string `json:"build_date"`
	PartNumber string `json:"part_number"`
	Version    string `json:"version"`
}

type AMDVRAM

// AMDVRAM holds the VRAM section of an AMD card description: memory type,
// vendor, size (an AMDVRAMSize value/unit pair) and bit width.
type AMDVRAM struct {
	Type     string      `json:"type"`
	Vendor   string      `json:"vendor"`
	Size     AMDVRAMSize `json:"size"`
	BitWidth int         `json:"bit_width"`
}

type AMDVRAMSize

// AMDVRAMSize is an integer value paired with its unit string; it is the type
// of AMDVRAM.Size. Unlike AMDUnitValue, the value is an int rather than a
// float64.
type AMDVRAMSize struct {
	Value int    `json:"value"`
	Unit  string `json:"unit"`
}

type AMDXGMIPlpd

// AMDXGMIPlpd holds the XGMI PLPD section of an AMD card description: the
// number of supported policies, the current policy ID and the list of policy
// entries.
type AMDXGMIPlpd struct {
	NumSupported int       `json:"num_supported"`
	CurrentID    int       `json:"current_id"`
	PLPDs        []AMDPLPD `json:"plpds"`
}

type Device

// Device is the interface every device backend provides: identification
// (Name, DevType, HwType), lifecycle (InitLib, Init, Shutdown) and GPU
// information queries (GetGPUInfo, GetAllGPUInfo). *MockDevice is one
// implementation.
type Device interface {
	// Name returns the name of the device.
	Name() string
	// DevType returns the type of the device (nvml, ...).
	DevType() DeviceType
	// HwType returns the type of hardware the device is (gpu, processor).
	HwType() string
	// InitLib performs the external library loading, if any.
	InitLib() error
	// Init initializes and starts the metric device.
	Init() error
	// Shutdown stops the metric device.
	Shutdown() bool
	// GetGPUInfo returns the Triton info for a specific GPU.
	GetGPUInfo(gpuID int) (TritonGPUInfo, error)
	// GetAllGPUInfo returns the Triton info for all GPUs on the host.
	GetAllGPUInfo() ([]TritonGPUInfo, error)
}

func MockDeviceDeviceStartup

func MockDeviceDeviceStartup() Device

func Startup

func Startup(a string) Device

Startup initializes and returns a new Device according to the given DeviceType [NVML|OTHER].

type DeviceType

// DeviceType identifies a kind of device. It is an int whose values are the
// constants declared below, and it has a String method.
type DeviceType int

// The DeviceType values are assigned by iota in declaration order:
// MOCK = 0, AMD = 1, NVML = 2, ROCM = 3.
const (
	MOCK DeviceType = iota
	AMD
	NVML
	ROCM
)

func (DeviceType) String

func (d DeviceType) String() string

type GPUDevice

// GPUDevice pairs a GPU's integer ID with its TritonGPUInfo.
type GPUDevice struct {
	ID         int
	TritonInfo TritonGPUInfo
}

type MockDevice

// MockDevice is a Device implementation: its pointer-receiver methods (Name,
// DevType, HwType, InitLib, Init, Shutdown, GetGPUInfo, GetAllGPUInfo) cover
// the whole Device interface. All of its fields are unexported.
type MockDevice struct {
	// contains filtered or unexported fields
}

func (*MockDevice) DevType

func (d *MockDevice) DevType() DeviceType

func (*MockDevice) GetAllGPUInfo

func (d *MockDevice) GetAllGPUInfo() ([]TritonGPUInfo, error)

func (*MockDevice) GetGPUInfo

func (d *MockDevice) GetGPUInfo(gpuID int) (TritonGPUInfo, error)

func (*MockDevice) HwType

func (d *MockDevice) HwType() string

func (*MockDevice) Init

func (d *MockDevice) Init() error

func (*MockDevice) InitLib

func (d *MockDevice) InitLib() error

func (*MockDevice) Name

func (d *MockDevice) Name() string

func (*MockDevice) Shutdown

func (d *MockDevice) Shutdown() bool

type ROCMCardInfo

// ROCMCardInfo describes one GPU as reported in ROCm-style JSON, whose keys
// are human-readable labels (for example "Card Series" or
// "VRAM Total Memory (B)"). Every value is kept as a string, including the
// memory figures, so callers must parse numbers themselves.
// NOTE(review): the "(B)" suffix in the memory keys presumably means the
// values are byte counts — confirm against the JSON producer.
type ROCMCardInfo struct {
	UniqueID           string `json:"Unique ID"`
	SerialNumber       string `json:"Serial Number"`
	VRAMTotalMemory    string `json:"VRAM Total Memory (B)"`
	VRAMUsedMemory     string `json:"VRAM Total Used Memory (B)"`
	VISVRAMTotalMemory string `json:"VIS_VRAM Total Memory (B)"`
	VISVRAMUsedMemory  string `json:"VIS_VRAM Total Used Memory (B)"`
	GTTTotalMemory     string `json:"GTT Total Memory (B)"`
	GTTUsedMemory      string `json:"GTT Total Used Memory (B)"`
	CardSeries         string `json:"Card Series"`
	CardModel          string `json:"Card Model"`
	CardVendor         string `json:"Card Vendor"`
	CardSKU            string `json:"Card SKU"`
	SubsystemID        string `json:"Subsystem ID"`
	DeviceRev          string `json:"Device Rev"`
	NodeID             string `json:"Node ID"`
	GUID               string `json:"GUID"`
	GFXVersion         string `json:"GFX Version"`
}

type ROCMGPUInfo

// ROCMGPUInfo groups the per-card information (an int-keyed map of
// ROCMCardInfo pointers) with the system-level driver information.
// NOTE(review): the map keys are presumably GPU indices, and either field may
// be nil until populated — confirm against the code that fills this struct.
type ROCMGPUInfo struct {
	GPUInfo map[int]*ROCMCardInfo
	DrvInfo *ROCMSystemInfo
}

type ROCMSystemInfo

// ROCMSystemInfo holds system-level information decoded from the JSON object
// under the "system" key; the only field read is "Driver version".
type ROCMSystemInfo struct {
	System struct {
		DriverVersion string `json:"Driver version"`
	} `json:"system"`
}

type Registry

// Registry stores device startup functions in a two-level map, keyed first by
// a string and then by DeviceType.
// NOTE(review): the string key presumably corresponds to the `a` argument of
// MustRegister and Startup — confirm against their implementations.
type Registry struct {
	Registry map[string]map[DeviceType]deviceStartupFunc // Static map of supported Devices Startup functions
}

func GetRegistry

func GetRegistry() *Registry

GetRegistry gets the default device Registry instance.

func (*Registry) GetAllDeviceTypes

func (r *Registry) GetAllDeviceTypes() []string

GetAllDeviceTypes returns a slice with all the registered devices.

func (*Registry) MustRegister

func (r *Registry) MustRegister(a string, d DeviceType, deviceStartup deviceStartupFunc)

func (*Registry) Unregister

func (r *Registry) Unregister(d DeviceType)

type TritonGPUInfo

type TritonGPUInfo struct {
	// Name represents the model name of the GPU (e.g., "Tesla V100", "RTX 3090", "Radeon RX 6900 XT").
	// This field is universal for all GPUs.
	Name string

	// UUID represents the unique identifier of the GPU, useful for distinguishing multiple GPUs.
	// This field is also universal.
	UUID string

	// ComputeCapability reflects the GPU's compute capability. For CUDA GPUs, it's a string
	// (e.g., "7.0" for Volta), while for ROCm GPUs, it might be represented by a version number
	// or a similar capability specifier. It could be used to identify supported instruction sets
	// and hardware features.
	ComputeCapability string

	// Arch identifies the GPU's architecture. For CUDA GPUs, it's a specific
	// number (e.g., 60 for Pascal, 70 for Volta), while ROCm uses a different notation
	// (e.g., "gfx906" for Vega GPUs). The field is a string so that both notations fit.
	Arch string

	// WarpSize reflects the number of threads in a single warp on the GPU.
	// This field would be applicable to CUDA GPUs (usually 32 threads per warp), but might be
	// handled differently in ROCm, as AMD uses wavefronts, which might not have a direct one-to-one mapping.
	WarpSize int

	// MemoryTotalMB represents the total amount of memory available on the GPU in megabytes.
	// This is common for both NVIDIA and AMD GPUs.
	MemoryTotalMB uint64

	// PTXVersion indicates the PTX version used for NVIDIA CUDA GPUs.
	// For ROCm, this would be replaced with a similar field for the intermediate language (e.g., HIP).
	PTXVersion int

	// Backend names the GPU backend as a free-form string.
	// NOTE(review): presumably a value such as "cuda" or "hip" — confirm the exact
	// values against the device implementations that populate it.
	Backend string
}

TritonGPUInfo holds key GPU fields relevant to Triton cache validation. It now supports both NVIDIA (CUDA) and AMD (ROCm) GPUs.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL