---
# qgpu-scheduler: Deployment, Service, and RBAC objects for the qGPU scheduler.
# All namespaced objects live in kube-system.
kind: Deployment
apiVersion: apps/v1
metadata:
  name: qgpu-scheduler
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: qgpu-scheduler
  template:
    metadata:
      labels:
        app: qgpu-scheduler
      annotations:
        # NOTE(review): this alpha annotation is deprecated upstream in favor
        # of `priorityClassName`; confirm the target cluster still honors it.
        scheduler.alpha.kubernetes.io/critical-pod: ''
    spec:
      hostNetwork: true
      tolerations:
        # Allow scheduling onto nodes tainted as master.
        - effect: NoSchedule
          operator: Exists
          key: node-role.kubernetes.io/master
      # `serviceAccount` is a deprecated alias; `serviceAccountName` is the
      # supported field and selects the same ServiceAccount.
      serviceAccountName: qgpu-scheduler
      containers:
        - name: qgpu-scheduler
          image: ccr.ccs.tencentyun.com/lionelxchen/mixed-scheduler:v61
          command: ["qgpu-scheduler", "--priority=binpack"]
          env:
            # Same number as the Service port below; presumably the port the
            # scheduler listens on (verify against the image).
            - name: PORT
              value: "12345"
          resources:
            # requests == limits for both cpu and memory.
            limits:
              memory: "800Mi"
              cpu: "1"
            requests:
              memory: "800Mi"
              cpu: "1"
---
# Service exposing the scheduler pods (selected by app=qgpu-scheduler).
apiVersion: v1
kind: Service
metadata:
  name: qgpu-scheduler
  namespace: kube-system
  labels:
    app: qgpu-scheduler
spec:
  ports:
    - port: 12345
      name: http
      targetPort: 12345
  selector:
    app: qgpu-scheduler
---
# Cluster-wide permissions granted to the scheduler's ServiceAccount.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: qgpu-scheduler
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - create
      - patch
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - update
      - patch
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - bindings
      - pods/binding
    verbs:
      - create
  - apiGroups:
      - ""
    resources:
      - configmaps
    verbs:
      - get
      - list
      - watch
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: qgpu-scheduler
  namespace: kube-system
---
# Binds the ClusterRole above to the kube-system/qgpu-scheduler ServiceAccount.
# ClusterRoleBinding is cluster-scoped, so it carries no metadata.namespace.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: qgpu-scheduler
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: qgpu-scheduler
subjects:
  - kind: ServiceAccount
    name: qgpu-scheduler
    namespace: kube-system
---
# qgpu-manager: DaemonSet and RBAC objects for the per-node qGPU manager.
# Pods run only on nodes labeled qgpu-device-enable=enable.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: qgpu-manager
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: qgpu-manager
  template:
    metadata:
      annotations:
        # NOTE(review): this alpha annotation is deprecated upstream in favor
        # of `priorityClassName`; confirm the target cluster still honors it.
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        app: qgpu-manager
    spec:
      # `serviceAccount` is a deprecated alias; `serviceAccountName` is the
      # supported field and selects the same ServiceAccount.
      serviceAccountName: qgpu-manager
      hostNetwork: true
      nodeSelector:
        qgpu-device-enable: "enable"
      initContainers:
        # Runs install.sh privileged with the host root filesystem at /host.
        - name: qgpu-installer
          image: ccr.ccs.tencentyun.com/lionelxchen/mixed-manager:v27
          command: ["/usr/bin/install.sh"]
          securityContext:
            privileged: true
          volumeMounts:
            - name: host-root
              mountPath: /host
      containers:
        - image: ccr.ccs.tencentyun.com/lionelxchen/mixed-manager:v27
          command:
            - "/usr/bin/qgpu-manager"
            # $(NODE_NAME) is expanded from the env var defined below.
            - "--nodename=$(NODE_NAME)"
            # Lives on the host's /var through the host-var mount.
            - "--dbfile=/host/var/lib/qgpu/meta.db"
          name: qgpu-manager
          resources:
            # requests == limits for both cpu and memory.
            limits:
              memory: "300Mi"
              cpu: "1"
            requests:
              memory: "300Mi"
              cpu: "1"
          env:
            # NOTE(review): no volume in this manifest mounts /etc/kubernetes
            # into the container; confirm this path exists inside the image.
            - name: KUBECONFIG
              value: /etc/kubernetes/kubelet.conf
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          securityContext:
            privileged: true
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: pod-resources
              mountPath: /var/lib/kubelet/pod-resources
            - name: host-var
              mountPath: /host/var
            - name: host-dev
              mountPath: /host/dev
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: pod-resources
          hostPath:
            path: /var/lib/kubelet/pod-resources
        - name: host-var
          hostPath:
            type: Directory
            path: /var
        - name: host-dev
          hostPath:
            type: Directory
            path: /dev
        - name: host-root
          hostPath:
            type: Directory
            path: /
---
# Cluster-wide permissions granted to the manager's ServiceAccount.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: qgpu-manager
rules:
  # NOTE(review): read access to every core-group resource (this includes
  # secrets) and makes the pod get/list/watch verbs below redundant; consider
  # narrowing to the resources the manager actually reads.
  - apiGroups:
      - ""
    resources:
      - "*"
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - create
      - patch
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - update
      - patch
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - nodes/status
    verbs:
      - patch
      - update
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: qgpu-manager
  namespace: kube-system
---
# Binds the ClusterRole above to the kube-system/qgpu-manager ServiceAccount.
# ClusterRoleBinding is cluster-scoped, so it carries no metadata.namespace.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: qgpu-manager
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: qgpu-manager
subjects:
  - kind: ServiceAccount
    name: qgpu-manager
    namespace: kube-system
---
# Offline Pod (fragment; `xx` values are placeholders to fill in).
# The annotation `tke.cloud.tencent.com/app-class: offline` marks the Pod as
# an offline Pod, and offline compute is requested through
# `tke.cloud.tencent.com/qgpu-core-greedy`.
# Note: offline Pods do not support multiple GPU cards, and the requested
# compute must be less than or equal to 100.
apiVersion: v1
kind: Pod
metadata:
  # Annotations belong under `metadata`; a top-level `annotations` key is not
  # a Pod field.
  annotations:
    tke.cloud.tencent.com/app-class: offline
spec:
  containers:
    - name: offline-container
      resources:
        requests:
          tke.cloud.tencent.com/qgpu-core-greedy: xx  # offline compute
          tke.cloud.tencent.com/qgpu-memory: xx
---
# Online Pod (fragment; `xx` values are placeholders to fill in).
# The annotation `tke.cloud.tencent.com/app-class: online` marks the Pod as an
# online Pod. It does not need to request compute; it only requests GPU memory.
apiVersion: v1
kind: Pod
metadata:
  # Annotations belong under `metadata`; a top-level `annotations` key is not
  # a Pod field.
  annotations:
    tke.cloud.tencent.com/app-class: online
spec:
  containers:
    - name: online-container
      resources:
        requests:
          tke.cloud.tencent.com/qgpu-memory: xx
---
# Normal Pod (fragment; `xx` values are placeholders to fill in).
# A normal Pod is one without the `tke.cloud.tencent.com/app-class` annotation
# (NOTE(review): the source sentence is truncated; this reading is inferred
# from the example below, which sets no annotation -- confirm).
# Normal Pods support multiple GPU cards.
apiVersion: v1
kind: Pod
spec:
  containers:
    - name: common-container
      resources:
        requests:
          tke.cloud.tencent.com/qgpu-core: xx
          tke.cloud.tencent.com/qgpu-memory: xx
本页内容是否解决了您的问题?