System configuration: Windows 10 64-bit, Anaconda3-2018.12-Windows-x86_64, numpy-1.15.4+vanilla-cp37-cp37m-win_amd64.whl, opencv_python-4.1.0-cp37-cp37m-win_amd64.whl.
Anaconda Prompt --> conda list shows pytorch 1.0.1, opencv-python 4.1.0, torchvision 0.2.2.
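Before running the main program, a quick sanity check (not part of the original scripts) confirms that the interpreter actually sees these versions:

import torch
import torchvision
import cv2
import numpy as np

print(torch.__version__)          # expect 1.0.1
print(torchvision.__version__)    # expect 0.2.2
print(cv2.__version__)            # expect 4.1.0
print(np.__version__)             # expect 1.15.4
print(torch.cuda.is_available())  # True when the CUDA build and driver match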
The main program is as follows:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as torch_optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
from datetime import datetime
#from dataset import ShelterOutcomeDataset, get_default_device
from dataset4 import *
from fasterAIModel4 import *

# Load Data
train = pd.read_csv(r'./Data/train.csv')
print("Shape:", train.shape)
train.head()
test = pd.read_csv(r'./Data/test.csv')
print("Shape:", test.shape)
test.head()
sample = pd.read_csv(r'./Data/sample_submission.csv')
sample.head()
# Very basic data exploration
Counter(train['OutcomeType'])
Counter(train['Name']).most_common(5)

# Data preprocessing
train_X = train.drop(columns=['OutcomeType', 'OutcomeSubtype', 'AnimalID'])
Y = train['OutcomeType']
test_X = test

# Stacking train and test set so that they undergo the same preprocessing
stacked_df = train_X.append(test_X.drop(columns=['ID']))
# stacked_df['DateTime'] = pd.to_datetime(stacked_df['DateTime'])
# stacked_df['year'] = stacked_df['DateTime'].dt.year
# stacked_df['month'] = stacked_df['DateTime'].dt.month
stacked_df = stacked_df.drop(columns=['DateTime'])
stacked_df.head()

# dropping columns with too many nulls
for col in stacked_df.columns:
    if stacked_df[col].isnull().sum() > 10000:
        print("dropping", col, stacked_df[col].isnull().sum())
        stacked_df = stacked_df.drop(columns=[col])
stacked_df.head()
# label encoding (fitted on the stacked frame so train and test share one encoding)
for col in stacked_df.columns:
    if stacked_df.dtypes[col] == "object":
        stacked_df[col] = stacked_df[col].fillna("NA")
    else:
        stacked_df[col] = stacked_df[col].fillna(0)
    stacked_df[col] = LabelEncoder().fit_transform(stacked_df[col])

# making all variables categorical
for col in stacked_df.columns:
    stacked_df[col] = stacked_df[col].astype('category')
# splitting back train and test (26729 is the number of rows in train.csv)
X = stacked_df[0:26729]
test_processed = stacked_df[26729:]
# check if shape[0] matches original
print("train shape: ", X.shape, "original: ", train.shape)
print("test shape: ", test_processed.shape, "original: ", test.shape)

# Encoding target
Y = LabelEncoder().fit_transform(Y)
# sanity check that the counts match, and match them against the previous counter to create the target dictionary
print(Counter(train['OutcomeType']))
print(Counter(Y))
target_dict = {
    'Return_to_owner': 3,
    'Euthanasia': 2,
    'Adoption': 0,
    'Transfer': 4,
    'Died': 1
}
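# Note: the mapping above is not arbitrary. LabelEncoder sorts class labels
# alphabetically, so the encoded targets are Adoption -> 0, Died -> 1,
# Euthanasia -> 2, Return_to_owner -> 3, Transfer -> 4. The submission
# columns at the end of the script rely on this same ordering.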
# train-valid split
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.10, random_state=0)
X_train.head()

# Choosing columns for embedding:
# categorical embedding for every column having more than one category
embedded_cols = {n: len(col.cat.categories) for n, col in X.items() if len(col.cat.categories) > 1}
embedded_cols
embedded_col_names = embedded_cols.keys()
len(X.columns) - len(embedded_cols)  # number of numerical columns
# Determining size of embedding
# (borrowed from https://www.usfca.edu/data-institute/certificates/fundamentals-deep-learning lesson 2)
embedding_sizes = [(n_categories, min(50, (n_categories+1)//2)) for _, n_categories in embedded_cols.items()]
embedding_sizes
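# Worked example of the sizing rule min(50, (n_categories + 1) // 2)
# (cardinalities below are made up for illustration): a column with 5
# categories gets an embedding of size (5 + 1) // 2 = 3, one with 44
# categories gets 22, and a high-cardinality column with 1380 categories
# is capped at min(50, 690) = 50, giving e.g. [(5, 3), (44, 22), (1380, 50)].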
# creating train and valid datasets
train_ds = ShelterOutcomeDataset(X_train, y_train, embedded_col_names)
valid_ds = ShelterOutcomeDataset(X_val, y_val, embedded_col_names)
device = get_default_device()
device
model = ShelterOutcomeModel(embedding_sizes)
to_device(model, device)

batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)
train_dl = DeviceDataLoader(train_dl, device)
valid_dl = DeviceDataLoader(valid_dl, device)

train_loop(device, model, train_dl, valid_dl, embedding_sizes, epochs=8, lr=0.05, wd=0.00001)

# test predictions (labels are dummy zeros; only the inputs are used)
test_ds = ShelterOutcomeDataset(test_processed, np.zeros(len(test_processed)), embedded_col_names)
batch_size = 1000
test_dl = DataLoader(test_ds, batch_size=batch_size)
test_dl = DeviceDataLoader(test_dl, device)

model.eval()  # ensure dropout/batchnorm are in inference mode
preds = []
with torch.no_grad():
    for x, y in test_dl:
        out = model(x.long())
        prob = F.softmax(out, dim=1)
        # print('{0}'.format(prob))
        preds.append(prob)
final_probs = [item for sublist in preds for item in sublist]
len(final_probs)
# reload the TorchScript model exported by train_loop and predict again
# (torch.jit.load returns a ready-to-run module, so no ShelterOutcomeModel
# instance or state_dict is needed here)
model_test = torch.jit.load('./CheckPoints/Pytorch1D.pt')
model_test.eval()
preds2 = []
with torch.no_grad():
    for x, y in test_dl:
        out = model_test(x.long())
        prob2 = F.softmax(out, dim=1)
        # print('{0}'.format(prob))
        preds2.append(prob2)
final_probs2 = [item for sublist in preds2 for item in sublist]
len(final_probs2)

# output
sample.head()
sample['Adoption'] = [float(t[0]) for t in final_probs]
sample['Died'] = [float(t[1]) for t in final_probs]
sample['Euthanasia'] = [float(t[2]) for t in final_probs]
sample['Return_to_owner'] = [float(t[3]) for t in final_probs]
sample['Transfer'] = [float(t[4]) for t in final_probs]
sample.head()
sample.to_csv('samp.csv', index=False)
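Note that the two prediction sets come from different weights: the in-memory model keeps the last epoch's weights, while the exported TorchScript model was traced from the best validation checkpoint, so they need not be identical. A quick comparison (not in the original post, assuming both lists line up row for row) makes that visible:

agree = all(torch.allclose(a, b, atol=1e-6) for a, b in zip(final_probs, final_probs2))
print("last-epoch vs best-val predictions identical:", agree)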
The fasterAIModel4.py script is as follows:
#import pandas as pd
import numpy as np
#from collections import Counter
#from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import LabelEncoder
import torch
#from torch.utils.data import Dataset, DataLoader
import torch.optim as torch_optim
import torch.nn as nn
import torch.nn.functional as F
#from torchvision import models
#from datetime import datetime

class ShelterOutcomeModel(nn.Module):
    def __init__(self, embedding_sizes):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(categories, size)
                                         for categories, size in embedding_sizes])
        n_emb = sum(e.embedding_dim for e in self.embeddings)  # length of all embeddings combined
        self.n_emb = n_emb
        self.lin1 = nn.Linear(self.n_emb, 200)
        self.lin2 = nn.Linear(200, 70)
        self.lin3 = nn.Linear(70, 5)
        self.bn1 = nn.BatchNorm1d(self.n_emb)
        self.bn2 = nn.BatchNorm1d(200)
        self.bn3 = nn.BatchNorm1d(70)
        self.emb_drop = nn.Dropout(0.6)
        self.drops = nn.Dropout(0.3)
    def forward(self, x_cat):
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)
        x = self.bn1(x)
        x = F.relu(self.lin1(x))
        x = self.drops(x)
        x = self.bn2(x)
        x = F.relu(self.lin2(x))
        x = self.drops(x)
        x = self.bn3(x)
        x = self.lin3(x)
        return x

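# Quick shape check with made-up embedding sizes (illustrative only):
# two categorical columns with 10 and 7 categories and embedding sizes 5
# and 4 give n_emb = 9 inputs to lin1 and a (batch, 5) logit output.
#   m = ShelterOutcomeModel([(10, 5), (7, 4)])
#   m.eval()  # BatchNorm1d requires eval mode for a single-row batch
#   m(torch.tensor([[3, 6]])).shape  # torch.Size([1, 5])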
def get_optimizer(model, lr=0.001, wd=0.0):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch_optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim

def train_model(model, optim, train_dl):
    model.train()
    total = 0
    sum_loss = 0
    for x, y in train_dl:
        batch = y.shape[0]
        output = model(x.long())
        loss = F.cross_entropy(output, y.long())
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += batch
        sum_loss += batch * loss.item()
    return sum_loss / total

def val_model(model, valid_dl):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    for x, y in valid_dl:
        current_batch_size = y.shape[0]
        out = model(x.long())
        loss = F.cross_entropy(out, y.long())
        sum_loss += current_batch_size * loss.item()
        total += current_batch_size
        pred = torch.max(out, 1)[1]
        # print(pred.type())
        # print(y.type())
        correct += (pred == y.long()).float().sum().item()
    # print("valid loss %.3f and accuracy %.3f" % (sum_loss/total, correct/total))
    return sum_loss / total, correct / total
def train_loop(device, model, train_dl, valid_dl, embedding_sizes, epochs, lr=0.01, wd=0.0):
    optim = get_optimizer(model, lr=lr, wd=wd)
    train_loss_iter = np.inf
    val_loss_iter = np.inf
    for i in range(epochs):
        train_loss = train_model(model, optim, train_dl)
        if train_loss < train_loss_iter:
            train_loss_iter = train_loss
            torch.save(model.state_dict(), './CheckPoints/train_weights.pth')
        val_loss, val_accuracy = val_model(model, valid_dl)
        print("epoch: {0}".format(i),
              "training loss: {:.5f}".format(train_loss),
              "val loss: {:.5f}".format(val_loss),
              "val accuracy: {:.5f}".format(val_accuracy))
        if val_loss < val_loss_iter:
            val_loss_iter = val_loss
            torch.save(model.state_dict(), './CheckPoints/val_weights.pth')
    # Save the model: reload the best validation weights and export as TorchScript
    model_test = ShelterOutcomeModel(embedding_sizes).to(device)
    model_test.load_state_dict(torch.load('./CheckPoints/val_weights.pth', map_location=device))
    model_test.eval()
    # dummy input for tracing: one row with one index per embedded column
    # (torch.rand is in [0, 1), so casting to long gives all zeros, which are valid indices)
    example = torch.rand(1, 5).long().to(device)
    with torch.no_grad():
        model_test = torch.jit.trace(model_test, example)
    model_test.save('./CheckPoints/Pytorch1D.pt')
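A portability note on the exported file: the trace runs on whatever `device` is, so a model traced on the GPU carries CUDA tensors. Loading it on a CPU-only machine needs an explicit map_location (a sketch, assuming the checkpoint path from above):

model_cpu = torch.jit.load('./CheckPoints/Pytorch1D.pt', map_location=torch.device('cpu'))
model_cpu.eval()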
The dataset4.py script (imported by the main program as dataset4) is as follows:
#import pandas as pd
import numpy as np
#from collections import Counter
#from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
#import torch.optim as torch_optim
#import torch.nn as nn
#import torch.nn.functional as F
#from torchvision import models
#from datetime import datetime

# Pytorch Dataset
class ShelterOutcomeDataset(Dataset):
    def __init__(self, X, Y, embedded_col_names):
        # X = X.copy()
        # self.X1 = X.loc[:, embedded_col_names].copy().values.astype(np.int64)  # categorical columns
        # self.X2 = X.drop(columns=embedded_col_names).copy().values.astype(np.float32)  # numerical columns
        # self.y = Y
        self.x = X.copy().values.astype(np.float32)
        self.y = Y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')

def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list, tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device

    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl:
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)
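A minimal usage sketch of these helpers with made-up data (not part of the original post) shows what a batch looks like after DeviceDataLoader moves it:

import pandas as pd

df = pd.DataFrame({'a': [0, 1, 2, 1], 'b': [1, 0, 1, 0]})
ds = ShelterOutcomeDataset(df, np.array([0, 1, 0, 1]), ['a', 'b'])
dl = DeviceDataLoader(DataLoader(ds, batch_size=2), get_default_device())
for xb, yb in dl:
    print(xb.shape, xb.dtype, yb.shape)  # torch.Size([2, 2]) torch.float32 torch.Size([2])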
References:
【1】Implementing a multi-layer network with PyTorch
【2】PyTorch in practice: the Kaggle house price prediction competition
【3】How to build an image classifier for garbage sorting
【4】Deep learning on tabular data with PyTorch
【5】https://jovian.ml/aakanksha-ns/shelter-outcome
【6】How to use PyTorch C++ (libtorch), part 2: calling a PyTorch model from Qt