Documentation
¶
Overview ¶
Copyright 2024-2025.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Copyright 2021-2025 ¶
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Index ¶
- Constants
- func RegisterMockDevice()
- func SetRegistry(registry *Registry)
- func TranslateGPUToArch(productName string) string
- type AMDASIC
- type AMDBoard
- type AMDBus
- type AMDCacheInfo
- type AMDCardInfo
- type AMDDriver
- type AMDGPUInfo
- type AMDListInfo
- type AMDNUMA
- type AMDPLPD
- type AMDPartition
- type AMDRAS
- type AMDUnitValue
- type AMDVBIOS
- type AMDVRAM
- type AMDVRAMSize
- type AMDXGMIPlpd
- type Device
- type DeviceType
- type GPUDevice
- type MockDevice
- func (d *MockDevice) DevType() DeviceType
- func (d *MockDevice) GetAllGPUInfo() ([]TritonGPUInfo, error)
- func (d *MockDevice) GetGPUInfo(gpuID int) (TritonGPUInfo, error)
- func (d *MockDevice) HwType() string
- func (d *MockDevice) Init() error
- func (d *MockDevice) InitLib() error
- func (d *MockDevice) Name() string
- func (d *MockDevice) Shutdown() bool
- type ROCMCardInfo
- type ROCMGPUInfo
- type ROCMSystemInfo
- type Registry
- type TritonGPUInfo
Constants ¶
// NVMLWarpSize is the constant 32.
// NOTE(review): presumably the warp size used for NVML-backed (NVIDIA)
// devices — confirm against the code that fills TritonGPUInfo.WarpSize.
const NVMLWarpSize = 32
Variables ¶
This section is empty.
Functions ¶
func RegisterMockDevice ¶
func RegisterMockDevice()
func SetRegistry ¶
func SetRegistry(registry *Registry)
SetRegistry replaces the global registry instance. NOTE: All plugins will need to be manually registered after this function is called.
func TranslateGPUToArch ¶
TranslateGPUToArch translates a GPU product name to its GFX architecture.
Types ¶
type AMDASIC ¶
// AMDASIC is the Go form of the "asic" JSON object embedded in AMDCardInfo.
// Each field is filled from the JSON key named in its struct tag; the type
// performs no validation of its own.
type AMDASIC struct {
	MarketName  string `json:"market_name"`
	VendorID    string `json:"vendor_id"`
	VendorName  string `json:"vendor_name"`
	SubvendorID string `json:"subvendor_id"`
	DeviceID    string `json:"device_id"`
	SubsystemID string `json:"subsystem_id"`
	RevID       string `json:"rev_id"`
	ASICSerial  string `json:"asic_serial"`
	// OAMID is deliberately untyped, so whatever JSON value appears under
	// "oam_id" (number, string, null, ...) is accepted as-is.
	// NOTE(review): presumably the producer emits either a number or a
	// placeholder string here — confirm against real tool output.
	OAMID                 any    `json:"oam_id"`
	NumComputeUnits       int    `json:"num_compute_units"`
	TargetGraphicsVersion string `json:"target_graphics_version"`
}
type AMDCacheInfo ¶
// AMDCacheInfo is one element of the "cache_info" JSON array in AMDCardInfo.
// Each field is filled from the JSON key named in its struct tag.
type AMDCacheInfo struct {
	Cache            int          `json:"cache"`
	CacheProperties  []string     `json:"cache_properties"`
	CacheSize        AMDUnitValue `json:"cache_size"`
	CacheLevel       int          `json:"cache_level"`
	NumCacheInstance int          `json:"num_cache_instance"`
}
type AMDCardInfo ¶
// AMDCardInfo is the Go form of one per-GPU JSON record. It groups the
// nested objects (asic, bus, vbios, ...) under the keys named in the struct
// tags, plus the scalar fields "gpu", "soc_pstate" and "process_isolation".
type AMDCardInfo struct {
	GPU              int            `json:"gpu"`
	ASIC             AMDASIC        `json:"asic"`
	Bus              AMDBus         `json:"bus"`
	VBIOS            AMDVBIOS       `json:"vbios"`
	Driver           AMDDriver      `json:"driver"`
	Board            AMDBoard       `json:"board"`
	RAS              AMDRAS         `json:"ras"`
	Partition        AMDPartition   `json:"partition"`
	SOCPState        string         `json:"soc_pstate"`
	XGMIPlpd         AMDXGMIPlpd    `json:"xgmi_plpd"`
	ProcessIsolation string         `json:"process_isolation"`
	NUMA             AMDNUMA        `json:"numa"`
	VRAM             AMDVRAM        `json:"vram"`
	CacheInfo        []AMDCacheInfo `json:"cache_info"`
}
type AMDGPUInfo ¶
// AMDGPUInfo bundles two int-keyed lookup tables of AMD GPU data.
// NOTE(review): the int key is presumably the GPU index — confirm against
// the code that populates these maps.
type AMDGPUInfo struct {
	// GPUInfo maps an int key to that GPU's AMDCardInfo record.
	GPUInfo map[int]*AMDCardInfo
	// ListInfo maps an int key to that GPU's AMDListInfo record.
	ListInfo map[int]*AMDListInfo
}
type AMDListInfo ¶
type AMDPartition ¶
type AMDRAS ¶
// AMDRAS is the Go form of the "ras" JSON object embedded in AMDCardInfo.
// Each field is filled from the JSON key named in its struct tag;
// ECCBlockState is a string-to-string map taken from "ecc_block_state".
type AMDRAS struct {
	EEPROMVersion   string            `json:"eeprom_version"`
	ParitySchema    string            `json:"parity_schema"`
	SingleBitSchema string            `json:"single_bit_schema"`
	DoubleBitSchema string            `json:"double_bit_schema"`
	PoisonSchema    string            `json:"poison_schema"`
	ECCBlockState   map[string]string `json:"ecc_block_state"`
}
type AMDUnitValue ¶
type AMDVRAM ¶
// AMDVRAM is the Go form of the "vram" JSON object embedded in AMDCardInfo.
// Each field is filled from the JSON key named in its struct tag.
type AMDVRAM struct {
	Type     string      `json:"type"`
	Vendor   string      `json:"vendor"`
	Size     AMDVRAMSize `json:"size"`
	BitWidth int         `json:"bit_width"`
}
type AMDVRAMSize ¶
type AMDXGMIPlpd ¶
type Device ¶
// Device is the set of operations a device exposes: identification
// (Name, DevType, HwType), lifecycle (InitLib, Init, Shutdown) and GPU
// information queries (GetGPUInfo, GetAllGPUInfo).
type Device interface {
	// Name returns the name of the device.
	Name() string
	// DevType returns the type of the device (nvml, ...).
	DevType() DeviceType
	// HwType returns the kind of hardware the device is (gpu, processor).
	HwType() string
	// InitLib loads the external library, if any.
	InitLib() error
	// Init initializes and starts the metric device.
	Init() error
	// Shutdown stops the metric device.
	// NOTE(review): the meaning of the bool result is not visible here —
	// presumably true on a clean stop; confirm with the implementations.
	Shutdown() bool
	// GetGPUInfo returns the Triton info for a specific GPU.
	GetGPUInfo(gpuID int) (TritonGPUInfo, error)
	// GetAllGPUInfo returns the Triton info for all GPUs on the host.
	GetAllGPUInfo() ([]TritonGPUInfo, error)
}
func MockDeviceDeviceStartup ¶
func MockDeviceDeviceStartup() Device
type DeviceType ¶
// DeviceType identifies the kind of device; it is an int-based enumeration.
type DeviceType int

// The DeviceType values are numbered with iota in declaration order, so
// MOCK is 0, AMD is 1, NVML is 2 and ROCM is 3. Each constant sits on its
// own line: Go needs a line break (or semicolon) between constant specs.
const (
	MOCK DeviceType = iota
	AMD
	NVML
	ROCM
)
func (DeviceType) String ¶
func (d DeviceType) String() string
type GPUDevice ¶
// GPUDevice pairs an integer GPU identifier with that GPU's TritonGPUInfo.
type GPUDevice struct {
	// ID is the integer identifier of the GPU.
	ID int
	// TritonInfo holds the Triton-related fields recorded for this GPU.
	TritonInfo TritonGPUInfo
}
type MockDevice ¶
// MockDevice is a device type whose fields are all unexported (they are
// elided from this listing). Its pointer-receiver methods — Name, DevType,
// HwType, InitLib, Init, Shutdown, GetGPUInfo and GetAllGPUInfo — have the
// same signatures as the methods of the Device interface.
type MockDevice struct {
// contains filtered or unexported fields
}
func (*MockDevice) DevType ¶
func (d *MockDevice) DevType() DeviceType
func (*MockDevice) GetAllGPUInfo ¶
func (d *MockDevice) GetAllGPUInfo() ([]TritonGPUInfo, error)
func (*MockDevice) GetGPUInfo ¶
func (d *MockDevice) GetGPUInfo(gpuID int) (TritonGPUInfo, error)
func (*MockDevice) HwType ¶
func (d *MockDevice) HwType() string
func (*MockDevice) Init ¶
func (d *MockDevice) Init() error
func (*MockDevice) InitLib ¶
func (d *MockDevice) InitLib() error
func (*MockDevice) Name ¶
func (d *MockDevice) Name() string
func (*MockDevice) Shutdown ¶
func (d *MockDevice) Shutdown() bool
type ROCMCardInfo ¶
// ROCMCardInfo is the Go form of one per-card JSON record whose keys are
// human-readable labels (for example "Card Series" or "GFX Version").
// Every value is kept as a string, including the byte counts under the
// "... Memory (B)" keys; callers must parse numbers themselves.
// NOTE(review): the key set looks like rocm-smi JSON output — confirm.
type ROCMCardInfo struct {
	UniqueID           string `json:"Unique ID"`
	SerialNumber       string `json:"Serial Number"`
	VRAMTotalMemory    string `json:"VRAM Total Memory (B)"`
	VRAMUsedMemory     string `json:"VRAM Total Used Memory (B)"`
	VISVRAMTotalMemory string `json:"VIS_VRAM Total Memory (B)"`
	VISVRAMUsedMemory  string `json:"VIS_VRAM Total Used Memory (B)"`
	GTTTotalMemory     string `json:"GTT Total Memory (B)"`
	GTTUsedMemory      string `json:"GTT Total Used Memory (B)"`
	CardSeries         string `json:"Card Series"`
	CardModel          string `json:"Card Model"`
	CardVendor         string `json:"Card Vendor"`
	CardSKU            string `json:"Card SKU"`
	SubsystemID        string `json:"Subsystem ID"`
	DeviceRev          string `json:"Device Rev"`
	NodeID             string `json:"Node ID"`
	GUID               string `json:"GUID"`
	GFXVersion         string `json:"GFX Version"`
}
type ROCMGPUInfo ¶
// ROCMGPUInfo bundles per-card ROCm records with the system-level record.
type ROCMGPUInfo struct {
	// GPUInfo maps an int key to that card's ROCMCardInfo record.
	// NOTE(review): the key is presumably the card index — confirm.
	GPUInfo map[int]*ROCMCardInfo
	// DrvInfo points to the system-level record (it carries the driver version).
	DrvInfo *ROCMSystemInfo
}
type ROCMSystemInfo ¶
// ROCMSystemInfo is the Go form of a JSON document shaped like
// {"system": {"Driver version": "..."}}.
type ROCMSystemInfo struct {
	// System mirrors the "system" JSON object.
	System struct {
		// DriverVersion is the string stored under "Driver version".
		DriverVersion string `json:"Driver version"`
	} `json:"system"`
}
type Registry ¶
// Registry holds the startup functions of the supported devices.
type Registry struct {
	// Registry is the static map of supported device startup functions,
	// keyed first by a string and then by DeviceType.
	// NOTE(review): the string key is presumably the first argument passed
	// to MustRegister — confirm.
	Registry map[string]map[DeviceType]deviceStartupFunc
}
func (*Registry) GetAllDeviceTypes ¶
GetAllDeviceTypes returns a slice with all the registered devices.
func (*Registry) MustRegister ¶
func (r *Registry) MustRegister(a string, d DeviceType, deviceStartup deviceStartupFunc)
func (*Registry) Unregister ¶
func (r *Registry) Unregister(d DeviceType)
type TritonGPUInfo ¶
// TritonGPUInfo holds the key GPU fields relevant to Triton cache
// validation. It covers both NVIDIA (CUDA) and AMD (ROCm) GPUs.
type TritonGPUInfo struct {
	// Name is the model name of the GPU (e.g., "Tesla V100", "RTX 3090",
	// "Radeon RX 6900 XT"). It applies to every GPU vendor.
	Name string

	// UUID is the unique identifier of the GPU, used to tell multiple GPUs
	// apart. It applies to every GPU vendor.
	UUID string

	// ComputeCapability is the GPU's compute capability as a string. For
	// CUDA GPUs this is the "major.minor" value (e.g., "6.1" for a Pascal
	// GPU, "7.0" for Volta); for ROCm GPUs it may be a version number or a
	// similar capability specifier. It can be used to identify supported
	// instruction sets and hardware features.
	ComputeCapability string

	// Arch identifies the GPU's architecture as a string. For CUDA GPUs it
	// is a number in string form (e.g., "60" for Pascal, "70" for Volta),
	// while ROCm uses a different notation (e.g., "gfx906" for Vega GPUs).
	Arch string

	// WarpSize is the number of threads in a single warp on the GPU.
	// CUDA GPUs usually have 32 threads per warp. AMD uses wavefronts,
	// which may not map one-to-one onto warps, so ROCm may treat this
	// field differently.
	WarpSize int

	// MemoryTotalMB is the total amount of memory available on the GPU, in
	// megabytes. It applies to both NVIDIA and AMD GPUs.
	MemoryTotalMB uint64

	// PTXVersion is the PTX version used for NVIDIA CUDA GPUs. For ROCm, an
	// equivalent value for its intermediate language (e.g., HIP) would
	// take its place.
	PTXVersion int

	// Backend names the backend this record belongs to.
	// NOTE(review): presumably distinguishes the NVIDIA (CUDA) and AMD
	// (ROCm) cases — confirm the accepted values with the producers.
	Backend string
}
TritonGPUInfo holds key GPU fields relevant to Triton cache validation. It now supports both NVIDIA (CUDA) and AMD (ROCm) GPUs.