~/.ssh/config
中,配置服务器的别名。本文创建别名为 tcg
。ssh-copy-id
命令,将本机 SSH 公钥复制至 GPU 云服务器。echo 'PasswordAuthentication no' | sudo tee -a /etc/ssh/sshd_config
sudo systemctl restart sshd
sudo apt install nvidia-driver-470
nvidia-smi
wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.11.0-Linux-x86_64.sh
chmod +x Miniconda3-py39_4.11.0-Linux-x86_64.sh
./Miniconda3-py39_4.11.0-Linux-x86_64.sh
rm Miniconda3-py39_4.11.0-Linux-x86_64.sh
~/.condarc
文件,加入以下软件源信息,将 conda 的软件源替换为清华源。

channels:
  - defaults
show_channel_urls: true
default_channels:
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/r
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/msys2
custom_channels:
  conda-forge: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  msys2: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  bioconda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  menpo: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  pytorch: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  pytorch-lts: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  simpleitk: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
pip config set global.index-url https://mirrors.cloud.tencent.com/pypi/simple
conda install pytorch torchvision cudatoolkit=11.3 -c pytorch --yes
python
import torch

# Verify the install: prints True when PyTorch can see a CUDA-capable GPU.
# Fixed typo: is_avaliable -> is_available (the misspelling raises AttributeError).
print(torch.cuda.is_available())
# Split a class-per-subdirectory image dataset into ImageNet-style
# train/validation folders with a train:val ratio of scale:1.
import math
import os
import shutil


def split_dataset(data_path='../raw', data_dst='../train_val', scale=4):
    """Copy images from data_path into data_dst/{train,validation}/<class>.

    data_path: root directory containing one subdirectory per class.
    data_dst:  destination root; must not already exist (os.mkdir raises otherwise).
    scale:     train:val ratio — train gets scale parts, validation gets 1 part.
    """
    # Create the ImageNet-style directory skeleton.
    os.mkdir(data_dst)
    os.mkdir(os.path.join(data_dst, 'train'))
    os.mkdir(os.path.join(data_dst, 'validation'))

    for item in os.listdir(data_path):
        item_path = os.path.join(data_path, item)
        if not os.path.isdir(item_path):
            continue  # skip stray files at the dataset root
        train_dst = os.path.join(data_dst, 'train', item)
        val_dst = os.path.join(data_dst, 'validation', item)
        os.mkdir(train_dst)
        os.mkdir(val_dst)

        files = os.listdir(item_path)
        print(f'Class {item}:\n\t Total sample count is {len(files)}')
        split_idx = math.floor(len(files) * scale / (1 + scale))
        print(f'\t Train sample count is {split_idx}')
        print(f'\t Val sample count is {len(files) - split_idx}\n')

        for idx, file in enumerate(files):
            file_path = os.path.join(item_path, file)
            # Bug fix: the original used `idx <= split_idx`, copying
            # split_idx + 1 files into train — one more than the printed
            # train count. `<` makes the copy match the reported split.
            if idx < split_idx:
                shutil.copy(file_path, train_dst)
            else:
                shutil.copy(file_path, val_dst)

    print(f'Split Complete. File path: {data_dst}')


if __name__ == '__main__':
    split_dataset()
Class roses:
	Total sample count is 641
	Train sample count is 512
	Validation sample count is 129
Class sunflowers:
	Total sample count is 699
	Train sample count is 559
	Validation sample count is 140
Class tulips:
	Total sample count is 799
	Train sample count is 639
	Validation sample count is 160
Class daisy:
	Total sample count is 633
	Train sample count is 506
	Validation sample count is 127
Class dandelion:
	Total sample count is 898
	Train sample count is 718
	Validation sample count is 180
git clone https://github.com/ver217/imagenet-tools.git
cd imagenet-tools && python3 make_tfrecords.py \
--raw_data_dir="../train_val" \
--local_scratch_dir="../train_val_tfrecord" && \
python3 make_idx.py --tfrecord_root="../train_val_tfrecord"
vit_tiny_patch16_224
模型,该模型的分辨率为224*224, 每个样本被划分为16个 patch
。pip install colossalai==0.1.5+torch1.11cu11.3 -f https://release.colossalai.org
pip install timm
import os

import colossalai
import torch
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.nn.lr_scheduler import CosineAnnealingLR
from colossalai.trainer import Trainer, hooks
from colossalai.utils import MultiTimer
from timm.models import vit_tiny_patch16_224
from titans.dataloader.imagenet import build_dali_imagenet

from mixup import MixupAccuracy, MixupLoss


def main():
    """Train ViT-Tiny/16 (224x224) on the 5-class flower dataset with Colossal-AI.

    Expects ./config.py (read by launch_from_torch) and a DALI-compatible
    TFRecord dataset under $DATA (default: ../train_val_tfrecord).
    """
    # parse_args keeps the CLI contract; the launch config itself comes from config.py.
    parser = colossalai.get_default_parser()
    args = parser.parse_args()
    colossalai.launch_from_torch(config='./config.py')
    logger = get_dist_logger()

    # Model: ViT-Tiny, 16x16 patches, 5 flower classes, 10% dropout.
    model = vit_tiny_patch16_224(num_classes=5, drop_rate=0.1)

    # DALI dataloaders built from the TFRecord dataset with RandAugment.
    root = os.environ.get('DATA', '../train_val_tfrecord')
    train_dataloader, test_dataloader = build_dali_imagenet(root, rand_augment=True)

    # Mixup wraps the base criterion; note the class (not an instance) is passed.
    criterion = MixupLoss(loss_fn_cls=torch.nn.CrossEntropyLoss)

    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
    # Stepped once per epoch (LRSchedulerHook below uses by_epoch=True).
    lr_scheduler = CosineAnnealingLR(optimizer, total_steps=gpc.config.NUM_EPOCHS)

    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(
        model,
        optimizer,
        criterion,
        train_dataloader,
        test_dataloader,
    )

    timer = MultiTimer()
    trainer = Trainer(engine=engine, timer=timer, logger=logger)

    hook_list = [
        hooks.LossHook(),
        hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True),
        hooks.AccuracyHook(accuracy_func=MixupAccuracy()),
        hooks.LogMetricByEpochHook(logger),
        hooks.LogMemoryByEpochHook(logger),
        hooks.LogTimingByEpochHook(timer, logger),
        hooks.TensorboardHook(log_dir='./tb_logs', ranks=[0]),  # only rank 0 writes TB logs
        hooks.SaveCheckpointHook(checkpoint_dir='./ckpt'),
    ]

    trainer.fit(
        train_dataloader=train_dataloader,
        epochs=gpc.config.NUM_EPOCHS,
        test_dataloader=test_dataloader,
        test_interval=1,
        hooks=hook_list,
        display_progress=True,
    )


if __name__ == '__main__':
    main()
"""Colossal-AI training configuration, loaded via colossalai.launch_from_torch."""
from colossalai.amp import AMP_TYPE

# Core training hyper-parameters.
BATCH_SIZE = 128
DROP_RATE = 0.1
NUM_EPOCHS = 200

# Mixed-precision training through native torch AMP.
# NOTE(review): Colossal-AI reads module-level names from this file — confirm
# whether `fp16` should be a top-level variable rather than nested in CONFIG.
CONFIG = dict(fp16=dict(mode=AMP_TYPE.TORCH))

# Accumulate gradients over 16 micro-steps; clip the gradient norm at 1.0.
gradient_accumulation = 16
clip_grad_norm = 1.0

# DALI pipeline options: GPU-side augmentation and mixup alpha of 0.2.
dali = dict(
    gpu_aug=True,
    mixup_alpha=0.2,
)
本页内容是否解决了您的问题?