gpu-machine

自动安装脚本

import json
import os

from tencentcloud.common import credential
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.cvm.v20170312 import cvm_client, models

# Launch one spot (SPOTPAID) GN7.2XLARGE32 instance in ap-tokyo-2 and hand it a
# Base64-encoded pre-install script through UserData.

# Read the API key pair from the environment instead of embedding it in the
# source: leaked code would otherwise leak the SecretId/SecretKey and endanger
# every resource in the account. See
# https://cloud.tencent.com/document/product/1278/85305 for safer key handling;
# keys are issued at https://console.cloud.tencent.com/cam/capi
secret_id = os.getenv("TENCENTCLOUD_SECRET_ID")
secret_key = os.getenv("TENCENTCLOUD_SECRET_KEY")
if not secret_id or not secret_key:
    # Fail fast with a clear message rather than sending an unsigned request.
    raise SystemExit(
        "Set TENCENTCLOUD_SECRET_ID and TENCENTCLOUD_SECRET_KEY before running."
    )

try:
    # Credential object built from the account's SecretId and SecretKey.
    cred = credential.Credential(secret_id, secret_key)
    # Optional HTTP profile; only needed to override defaults such as the endpoint.
    httpProfile = HttpProfile()
    httpProfile.endpoint = "cvm.ap-tokyo.tencentcloudapi.com"

    # Optional client profile wrapping the HTTP profile.
    clientProfile = ClientProfile()
    clientProfile.httpProfile = httpProfile
    # Client for the CVM product in the ap-tokyo region (clientProfile is optional).
    client = cvm_client.CvmClient(cred, "ap-tokyo", clientProfile)

    # Every API action has a matching request object.
    req = models.RunInstancesRequest()
    params = {
        "InstanceChargeType": "SPOTPAID",
        "DisableApiTermination": False,
        "Placement": {
            "Zone": "ap-tokyo-2",
            "ProjectId": 0
        },
        "InstanceMarketOptions": {
            "SpotOptions": {
                "MaxPrice": "1000"
            }
        },
        "VirtualPrivateCloud": {
            "AsVpcGateway": False,
            "VpcId": "DEFAULT",
            "SubnetId": "DEFAULT"
        },
        "InstanceType": "GN7.2XLARGE32",
        "ImageId": "img-487zeit5",
        "SystemDisk": {
            "DiskSize": 100,
            "DiskType": "CLOUD_BSSD"
        },
        "InternetAccessible": {
            "InternetMaxBandwidthOut": 20,
            "PublicIpAssigned": True,
            "InternetChargeType": "TRAFFIC_POSTPAID_BY_HOUR"
        },
        "InstanceName": "gpt-tk",
        "LoginSettings": {
            # ID of a key pair already stored in Tencent Cloud.
            "KeyIds": [ "skey-l0zbjr5d" ]
        },
        "InstanceCount": 1,
        "EnhancedService": {
            "SecurityService": {
                "Enabled": True
            },
            "MonitorService": {
                "Enabled": True
            },
            "AutomationService": {
                "Enabled": True
            }
        },
        # Base64 of a bash script that appends the driver/CUDA/cuDNN install
        # settings to /tmp/nv_driver_install.ini.
        "UserData": "IyEvYmluL2Jhc2gKICAgIGVjaG8gIgogICAgICAgICAgaW5zdGFsbF9kcml2ZXI9MQogICAgICAgICAgZHJpdmVyX3ZlcnNpb249TlZJRElBLUxpbnV4LXg4Nl82NC01MjUuMTA1LjE3LnJ1bgogICAgICAgIAogICAgICAgICAgaW5zdGFsbF9mYWJyaWNfbWFuYWdlcj0wCiAgICAgICAgCiAgICAgICAgICAgIGluc3RhbGxfY3VkYT0xCiAgICAgICAgICAgIGN1ZGFfdmVyc2lvbj1jdWRhXzEyLjAuMV81MjUuODUuMTJfbGludXgucnVuCiAgICAgICAgICAgIAogICAgICAgICAgICBpbnN0YWxsX2N1ZG5uPTEKICAgICAgICAgICAgY3Vkbm5fdmVyc2lvbjE9Y3Vkbm4tbGludXgteDg2XzY0LTguOC4wLjEyMV9jdWRhMTItYXJjaGl2ZS50YXIueHoKICAgICAgICAgICAgY3Vkbm5fdmVyc2lvbjI9CiAgICAgICAgICAgIGN1ZG5uX3ZlcnNpb24zPQogICAgICAgICAgICAKICAgICAgICAgIHRhY29fc2V0dXA9MAogICAgICAgICIgPj4gL3RtcC9udl9kcml2ZXJfaW5zdGFsbC5pbmkKICAgICAgICAgICAgICAgICAgICAgICAgICA="
    }
    req.from_json_string(json.dumps(params))

    # resp is a RunInstancesResponse instance matching the request object.
    resp = client.RunInstances(req)
    # Print the response as a JSON string.
    print(resp.to_json_string())

except TencentCloudSDKException as err:
    print(err)

UserData 其实就是 Base64 编码后的预安装脚本, 解码后的内容如下:

#!/bin/bash
# Pre-install script that is Base64-encoded and sent as the instance UserData.
# It appends one block of key=value settings to /tmp/nv_driver_install.ini:
# install_driver, install_cuda and install_cudnn are set to 1 (with the exact
# package file names to use); install_fabric_manager and taco_setup are set to 0.
# NOTE(review): presumably the image's GPU auto-install tooling reads this file
# on first boot - confirm against the Tencent Cloud GPU image documentation.
    echo "
          install_driver=1
          driver_version=NVIDIA-Linux-x86_64-525.105.17.run
        
          install_fabric_manager=0
        
            install_cuda=1
            cuda_version=cuda_12.0.1_525.85.12_linux.run
            
            install_cudnn=1
            cudnn_version1=cudnn-linux-x86_64-8.8.0.121_cuda12-archive.tar.xz
            cudnn_version2=
            cudnn_version3=
            
          taco_setup=0
        " >> /tmp/nv_driver_install.ini
                          

skey 是个人密钥保存在腾讯云后对应的密钥 ID (标记), 请求里只需引用该 ID, 不需暴露公钥.

loginSettings1.setKeyIds

java 自动安装

import com.tencentcloudapi.common.Credential;
import com.tencentcloudapi.common.profile.ClientProfile;
import com.tencentcloudapi.common.profile.HttpProfile;
import com.tencentcloudapi.common.exception.TencentCloudSDKException;
import com.tencentcloudapi.cvm.v20170312.CvmClient;
import com.tencentcloudapi.cvm.v20170312.models.*;

/**
 * Launches one spot (SPOTPAID) GN7.2XLARGE32 instance in ap-tokyo-2 through the
 * CVM RunInstances API and prints the JSON response.
 *
 * The API key pair is read from the TENCENTCLOUD_SECRET_ID and
 * TENCENTCLOUD_SECRET_KEY environment variables so that no secret is stored in
 * the source code.
 */
public class RunInstances
{
    public static void main(String [] args) {
        // Leaked code would leak an embedded SecretId/SecretKey and endanger every
        // resource in the account, so the key pair comes from the environment.
        // See https://cloud.tencent.com/document/product/1278/85305 for safer key
        // handling; keys are issued at https://console.cloud.tencent.com/cam/capi
        String secretId = System.getenv("TENCENTCLOUD_SECRET_ID");
        String secretKey = System.getenv("TENCENTCLOUD_SECRET_KEY");
        if (secretId == null || secretId.isEmpty() || secretKey == null || secretKey.isEmpty()) {
            // Fail fast with a clear message rather than sending an unsigned request.
            System.err.println("Set TENCENTCLOUD_SECRET_ID and TENCENTCLOUD_SECRET_KEY before running.");
            System.exit(1);
        }
        try{
            // Credential object built from the account's SecretId and SecretKey.
            Credential cred = new Credential(secretId, secretKey);
            // Optional HTTP profile; only needed to override defaults such as the endpoint.
            HttpProfile httpProfile = new HttpProfile();
            httpProfile.setEndpoint("cvm.ap-tokyo.tencentcloudapi.com");
            // Optional client profile wrapping the HTTP profile.
            ClientProfile clientProfile = new ClientProfile();
            clientProfile.setHttpProfile(httpProfile);
            // Client for the CVM product in the ap-tokyo region (clientProfile is optional).
            CvmClient client = new CvmClient(cred, "ap-tokyo", clientProfile);
            // Every API action has a matching request object.
            RunInstancesRequest req = new RunInstancesRequest();
            req.setInstanceChargeType("SPOTPAID");
            req.setDisableApiTermination(false);
            Placement placement1 = new Placement();
            placement1.setZone("ap-tokyo-2");
            placement1.setProjectId(0L);
            req.setPlacement(placement1);

            InstanceMarketOptionsRequest instanceMarketOptionsRequest1 = new InstanceMarketOptionsRequest();
            SpotMarketOptions spotMarketOptions1 = new SpotMarketOptions();
            spotMarketOptions1.setMaxPrice("1000");
            instanceMarketOptionsRequest1.setSpotOptions(spotMarketOptions1);

            req.setInstanceMarketOptions(instanceMarketOptionsRequest1);

            VirtualPrivateCloud virtualPrivateCloud1 = new VirtualPrivateCloud();
            virtualPrivateCloud1.setAsVpcGateway(false);
            virtualPrivateCloud1.setVpcId("DEFAULT");
            virtualPrivateCloud1.setSubnetId("DEFAULT");
            req.setVirtualPrivateCloud(virtualPrivateCloud1);

            req.setInstanceType("GN7.2XLARGE32");
            req.setImageId("img-487zeit5");
            SystemDisk systemDisk1 = new SystemDisk();
            systemDisk1.setDiskSize(100L);
            systemDisk1.setDiskType("CLOUD_BSSD");
            req.setSystemDisk(systemDisk1);

            InternetAccessible internetAccessible1 = new InternetAccessible();
            internetAccessible1.setInternetMaxBandwidthOut(20L);
            internetAccessible1.setPublicIpAssigned(true);
            internetAccessible1.setInternetChargeType("TRAFFIC_POSTPAID_BY_HOUR");
            req.setInternetAccessible(internetAccessible1);

            req.setInstanceName("gpt-tk");
            LoginSettings loginSettings1 = new LoginSettings();

            // ID of a key pair already stored in Tencent Cloud.
            String[] keyIds1 = {"skey-l0zbjr5d"};
            loginSettings1.setKeyIds(keyIds1);

            req.setLoginSettings(loginSettings1);

            req.setInstanceCount(1L);
            EnhancedService enhancedService1 = new EnhancedService();
            RunSecurityServiceEnabled runSecurityServiceEnabled1 = new RunSecurityServiceEnabled();
            runSecurityServiceEnabled1.setEnabled(true);
            enhancedService1.setSecurityService(runSecurityServiceEnabled1);

            RunMonitorServiceEnabled runMonitorServiceEnabled1 = new RunMonitorServiceEnabled();
            runMonitorServiceEnabled1.setEnabled(true);
            enhancedService1.setMonitorService(runMonitorServiceEnabled1);

            RunAutomationServiceEnabled runAutomationServiceEnabled1 = new RunAutomationServiceEnabled();
            runAutomationServiceEnabled1.setEnabled(true);
            enhancedService1.setAutomationService(runAutomationServiceEnabled1);

            req.setEnhancedService(enhancedService1);

            // Base64 of a bash script that appends the driver/CUDA/cuDNN install
            // settings to /tmp/nv_driver_install.ini.
            req.setUserData("IyEvYmluL2Jhc2gKICAgIGVjaG8gIgogICAgICAgICAgaW5zdGFsbF9kcml2ZXI9MQogICAgICAgICAgZHJpdmVyX3ZlcnNpb249TlZJRElBLUxpbnV4LXg4Nl82NC01MjUuMTA1LjE3LnJ1bgogICAgICAgIAogICAgICAgICAgaW5zdGFsbF9mYWJyaWNfbWFuYWdlcj0wCiAgICAgICAgCiAgICAgICAgICAgIGluc3RhbGxfY3VkYT0xCiAgICAgICAgICAgIGN1ZGFfdmVyc2lvbj1jdWRhXzEyLjAuMV81MjUuODUuMTJfbGludXgucnVuCiAgICAgICAgICAgIAogICAgICAgICAgICBpbnN0YWxsX2N1ZG5uPTEKICAgICAgICAgICAgY3Vkbm5fdmVyc2lvbjE9Y3Vkbm4tbGludXgteDg2XzY0LTguOC4wLjEyMV9jdWRhMTItYXJjaGl2ZS50YXIueHoKICAgICAgICAgICAgY3Vkbm5fdmVyc2lvbjI9CiAgICAgICAgICAgIGN1ZG5uX3ZlcnNpb24zPQogICAgICAgICAgICAKICAgICAgICAgIHRhY29fc2V0dXA9MAogICAgICAgICIgPj4gL3RtcC9udl9kcml2ZXJfaW5zdGFsbC5pbmkKICAgICAgICAgICAgICAgICAgICAgICAgICA=");
            // resp is a RunInstancesResponse instance matching the request object.
            RunInstancesResponse resp = client.RunInstances(req);
            // Print the response as a JSON string.
            System.out.println(RunInstancesResponse.toJsonString(resp));
        } catch (TencentCloudSDKException e) {
            System.out.println(e.toString());
        }
    }
}

从镜像安装

import json
import os

from tencentcloud.common import credential
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.cvm.v20170312 import cvm_client, models

# Launch one spot (SPOTPAID) GN7.2XLARGE32 instance in ap-tokyo-2 from image
# img-l9edrk6k, inside an explicit VPC/subnet with two security groups and the
# host name "ocx". No UserData is sent in this variant.

# Read the API key pair from the environment instead of embedding it in the
# source: leaked code would otherwise leak the SecretId/SecretKey and endanger
# every resource in the account. See
# https://cloud.tencent.com/document/product/1278/85305 for safer key handling;
# keys are issued at https://console.cloud.tencent.com/cam/capi
secret_id = os.getenv("TENCENTCLOUD_SECRET_ID")
secret_key = os.getenv("TENCENTCLOUD_SECRET_KEY")
if not secret_id or not secret_key:
    # Fail fast with a clear message rather than sending an unsigned request.
    raise SystemExit(
        "Set TENCENTCLOUD_SECRET_ID and TENCENTCLOUD_SECRET_KEY before running."
    )

try:
    # Credential object built from the account's SecretId and SecretKey.
    cred = credential.Credential(secret_id, secret_key)
    # Optional HTTP profile; only needed to override defaults such as the endpoint.
    httpProfile = HttpProfile()
    httpProfile.endpoint = "cvm.ap-tokyo.tencentcloudapi.com"

    # Optional client profile wrapping the HTTP profile.
    clientProfile = ClientProfile()
    clientProfile.httpProfile = httpProfile
    # Client for the CVM product in the ap-tokyo region (clientProfile is optional).
    client = cvm_client.CvmClient(cred, "ap-tokyo", clientProfile)

    # Every API action has a matching request object.
    req = models.RunInstancesRequest()
    params = {
        "InstanceChargeType": "SPOTPAID",
        "DisableApiTermination": False,
        "Placement": {
            "Zone": "ap-tokyo-2",
            "ProjectId": 0
        },
        "InstanceMarketOptions": {
            "SpotOptions": {
                "MaxPrice": "1000"
            }
        },
        "VirtualPrivateCloud": {
            "AsVpcGateway": False,
            "VpcId": "vpc-gywmabvi",
            "SubnetId": "subnet-q6nol0g7",
            "Ipv6AddressCount": 0
        },
        "InstanceType": "GN7.2XLARGE32",
        "ImageId": "img-l9edrk6k",
        "SystemDisk": {
            "DiskSize": 100,
            "DiskType": "CLOUD_BSSD"
        },
        "InternetAccessible": {
            "InternetMaxBandwidthOut": 30,
            "PublicIpAssigned": True,
            "InternetChargeType": "TRAFFIC_POSTPAID_BY_HOUR"
        },
        "LoginSettings": {
            # ID of a key pair already stored in Tencent Cloud.
            "KeyIds": [ "skey-ppht5ylz" ]
        },
        "SecurityGroupIds": [ "sg-2ggkxub0", "sg-nle8qa5c" ],
        "InstanceCount": 1,
        "EnhancedService": {
            "SecurityService": {
                "Enabled": True
            },
            "MonitorService": {
                "Enabled": True
            },
            "AutomationService": {
                "Enabled": False
            }
        },
        "HostName": "ocx"
    }
    req.from_json_string(json.dumps(params))

    # resp is a RunInstancesResponse instance matching the request object.
    resp = client.RunInstances(req)
    # Print the response as a JSON string.
    print(resp.to_json_string())

except TencentCloudSDKException as err:
    print(err)

安装前的自动bash数据

#!/bin/bash
# Pre-install bash data: appends one block of key=value settings to
# /tmp/nv_driver_install.ini. install_driver, install_cuda and install_cudnn are
# set to 1 (with the exact package file names to use); install_fabric_manager
# and taco_setup are set to 0.
# NOTE(review): presumably the image's GPU auto-install tooling reads this file
# on first boot - confirm against the Tencent Cloud GPU image documentation.
    echo "
          install_driver=1
          driver_version=NVIDIA-Linux-x86_64-525.105.17.run
        
          install_fabric_manager=0
        
            install_cuda=1
            cuda_version=cuda_12.0.1_525.85.12_linux.run
            
            install_cudnn=1
            cudnn_version1=cudnn-linux-x86_64-8.8.0.121_cuda12-archive.tar.xz
            cudnn_version2=
            cudnn_version3=
            
          taco_setup=0
        " >> /tmp/nv_driver_install.ini
                          

登录后的通知

新建的云端机器无法立即使用, 登录后会先自动下载并安装依赖 (GPU 驱动、CUDA、cuDNN). 不确定制作为镜像后这一过程是否仍然有效, 后面测试下.

Hello, This script will download and install the GPU driver, CUDA, CUDNN library automatically, you can not suspend or stop it until completed.
1. The whole process will take about 15 to 25 minutes. During this time, please do not operate the GPU or install any GPU related software.

Driver install finished: NVIDIA-Linux-x86_64-525.105.17.run, the remaining installation needs about 14-19 minutes.

Installing cuda: cuda_12.0.1_525.85.12_linux.run, it will take about 5 minutes. the remaining installation needs about 12-16 minutes.
[####################################################################################################] 100% /

Downloading cudnn: cudnn-linux-x86_64-8.8.0.121_cuda12-archive.tar.xz, it will take about 2 minutes. the remaining installation needs about 4-7 minutes.
[####################################################################################################] 100% /

Installing cudnn: cudnn-linux-x86_64-8.8.0.121_cuda12-archive.tar.xz, it will take about 2 minutes. the remaining installation needs about 2-3 minutes.
[####################################################################################################] 100% /

All install OK! Enjoy it!

步骤

ubuntu@ocx:/data$ sudo chown ubuntu.ubuntu /data
ubuntu@ocx:/data$ mkdir /data/projects
ubuntu@ocx:/data$ cd /data/projects/

腾讯云竞价实例

竞价实例常见问题

https://cloud.tencent.com/document/product/213/17817

系统中断前2分钟通过 Metadata（实例元数据）的方式来通知您即将中断回收该实例.
按秒收费, 而不是按完整1个小时收费
每个可用区最多可以有50核的竞价实例

元数据查询实例触发回收时间

https://cloud.tencent.com/document/product/213/37970

系统中断前2分钟通过 Metadata（实例元数据）的方式来通知您即将中断回收该实例.

通过 Metadata 获取竞价实例回收状态信息, 需要在实例内部通过http请求获取. 如果返回404，则表示该实例非竞价实例或还未触发回收。

curl metadata.tencentyun.com/latest/meta-data/spot/termination-time

返回类似如下信息，则表示为竞价实例回收时间, 时区标准为 UTC +8。

2018-08-18 12:05:33

如果返回404，则表示该实例非竞价实例或还未触发回收。

机器元数据访问

https://cloud.tencent.com/document/product/213/4934

不只是查询实例到期时间, 还可以获取不少机器信息, 包括用户基础信息

在实例内部可以通过实例元数据访问实例本地 IP、公网 IP 等数据以管理与外部应用程序的连接。

直接访问, 返回可以查询的endpoint

ubuntu@ocx:~$ curl http://metadata.tencentyun.com/latest/meta-data/

app-id
instance-id
local-ipv4
mac
public-ipv4
uuid
instance-name
hostname
runcmd
placement/
payment/
network/
volumes/
instance/
public-keys/
ntp/
rdma-subnet

curl http://metadata.tencentyun.com/latest/meta-data/payment/charge-type
SPOT

网络带宽规则

https://cloud.tencent.com/document/product/213/12523#.E5.85.A5.E7.BD.91.E5.B8.A6.E5.AE.BD.E4.B8.8A.E9.99.90.EF.BC.88.E4.B8.8A.E8.A1.8C.E5.B8.A6.E5.AE.BD.EF.BC.89

入网带宽上限（上行带宽）

公网的入网带宽是指流入云服务器实例的带宽。

按流量计费、按固定带宽计费（包括按小时带宽和包月带宽）的公网 IP：
- 用户购买的带宽小于等于10Mbps时，腾讯云会分配10Mbps外网入方向带宽。
- 用户购买的带宽大于10Mbps时，腾讯云会分配与购买的带宽相等的外网入方向带宽。
按共享带宽包计费的公网 IP：腾讯云会分配与购买的带宽相等的外网入方向带宽。

chat

https://github.com/THUDM/ChatGLM2-6B

torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 214.00 MiB (GPU 0; 15.57 GiB total capacity; 576.02 MiB already allocated; 27.19 MiB free; 578.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

自动安装脚本

java 自动安装

从镜像安装

安装前的自动bash数据

登录后的通知

步骤

腾讯云竞价实例

竞价实例常见问题

元数据查询实例触发回收时间

机器元数据访问

网络带宽规则

chat