PyTorch One-Dimensional Data Processing

Posted on 2017-09-14 23:49:18
System configuration: WIN10 64-bit, Anaconda3-2018.12-Windows-x86_64, numpy-1.15.4+vanilla-cp37-cp37m-win_amd64.whl, opencv_python-4.1.0-cp37-cp37m-win_amd64.whl
Anaconda Prompt --> conda list: pytorch=1.0.1, opencv-python=4.1.0, torchvision=0.2.2
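
To confirm your environment matches the versions above, a quick check (a minimal sketch; the printed versions depend on your installation):

import torch
import torchvision
import cv2
import numpy as np

print("PyTorch:", torch.__version__)            # expected: 1.0.1
print("torchvision:", torchvision.__version__)  # expected: 0.2.2
print("OpenCV:", cv2.__version__)               # expected: 4.1.0
print("NumPy:", np.__version__)                 # expected: 1.15.4
print("CUDA available:", torch.cuda.is_available())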

The main program is as follows:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as torch_optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
from datetime import datetime
#from dataset import ShelterOutcomeDataset, get_default_device
from dataset4 import *
from fasterAIModel4 import *

# Load data
train = pd.read_csv(r'./Data/train.csv')
print("Shape:", train.shape)
train.head()

test = pd.read_csv(r'./Data/test.csv')
print("Shape:", test.shape)
test.head()

sample = pd.read_csv(r'./Data/sample_submission.csv')
sample.head()

# Very basic data exploration
Counter(train['OutcomeType'])
Counter(train['Name']).most_common(5)

# Data preprocessing
train_X = train.drop(columns=['OutcomeType', 'OutcomeSubtype', 'AnimalID'])
Y = train['OutcomeType']
test_X = test

# Stacking train and test set so that they undergo the same preprocessing
# (DataFrame.append was removed in pandas 2.0; use pd.concat there instead)
stacked_df = train_X.append(test_X.drop(columns=['ID']))
# stacked_df['DateTime'] = pd.to_datetime(stacked_df['DateTime'])
# stacked_df['year'] = stacked_df['DateTime'].dt.year
# stacked_df['month'] = stacked_df['DateTime'].dt.month
stacked_df = stacked_df.drop(columns=['DateTime'])
stacked_df.head()

# Dropping columns with too many nulls
for col in stacked_df.columns:
    if stacked_df[col].isnull().sum() > 10000:
        print("dropping", col, stacked_df[col].isnull().sum())
        stacked_df = stacked_df.drop(columns=[col])
stacked_df.head()

# Label encoding
for col in stacked_df.columns:
    if stacked_df.dtypes[col] == "object":
        stacked_df[col] = stacked_df[col].fillna("NA")
    else:
        stacked_df[col] = stacked_df[col].fillna(0)
    stacked_df[col] = LabelEncoder().fit_transform(stacked_df[col])
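# Note: LabelEncoder maps each distinct value to an integer code in sorted
# order, e.g. LabelEncoder().fit_transform(["dog", "cat", "dog"]) gives [1, 0, 1];
# filling NaNs first ensures the "NA"/0 placeholder gets a code of its own.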

# Making all variables categorical
for col in stacked_df.columns:
    stacked_df[col] = stacked_df[col].astype('category')

# Splitting back into train and test
X = stacked_df[0:26729]
test_processed = stacked_df[26729:]

# Check that shape[0] matches the originals
print("train shape: ", X.shape, "original: ", train.shape)
print("test shape: ", test_processed.shape, "original: ", test.shape)

# Encoding the target
Y = LabelEncoder().fit_transform(Y)
# Sanity check: the class counts should match the Counter above; use them
# to build the target dictionary
print(Counter(train['OutcomeType']))
print(Counter(Y))
target_dict = {
    'Return_to_owner': 3,
    'Euthanasia': 2,
    'Adoption': 0,
    'Transfer': 4,
    'Died': 1
}
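# Note: these codes come from LabelEncoder, which sorts class labels
# alphabetically: Adoption=0, Died=1, Euthanasia=2, Return_to_owner=3, Transfer=4.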

# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.10, random_state=0)
X_train.head()

# Choosing columns for embedding
# (categorical embeddings for every column with more than one category)
embedded_cols = {n: len(col.cat.categories) for n, col in X.items() if len(col.cat.categories) > 1}
embedded_cols
embedded_col_names = embedded_cols.keys()
len(X.columns) - len(embedded_cols)  # number of numerical columns
# Determining the size of each embedding
# (borrowed from https://www.usfca.edu/data-institute/certificates/fundamentals-deep-learning lesson 2)
embedding_sizes = [(n_categories, min(50, (n_categories+1)//2)) for _, n_categories in embedded_cols.items()]
embedding_sizes
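# Worked example of the sizing rule (n_categories, min(50, (n_categories + 1) // 2)):
# a column with 5 categories gets a 3-dimensional embedding -> (5, 3);
# a column with 1000 categories is capped at 50 dimensions -> (1000, 50).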

# Creating train and valid datasets
train_ds = ShelterOutcomeDataset(X_train, y_train, embedded_col_names)
valid_ds = ShelterOutcomeDataset(X_val, y_val, embedded_col_names)

device = get_default_device()
device

model = ShelterOutcomeModel(embedding_sizes)
to_device(model, device)

batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)

# Wrap the loaders so each batch is moved to the device as it is yielded
train_dl = DeviceDataLoader(train_dl, device)
valid_dl = DeviceDataLoader(valid_dl, device)

train_loop(device, model, train_dl, valid_dl, embedding_sizes, epochs=8, lr=0.05, wd=0.00001)

# Test set (dummy all-zero labels, since only the inputs are used)
test_ds = ShelterOutcomeDataset(test_processed, np.zeros(len(test_processed)), embedded_col_names)
batch_size = 1000
test_dl = DataLoader(test_ds, batch_size=batch_size)
test_dl = DeviceDataLoader(test_dl, device)

preds = []
with torch.no_grad():
    for x, y in test_dl:
        out = model(x.long())
        prob = F.softmax(out, dim=1)
#        print('{0}'.format(prob))
        preds.append(prob)
final_probs = [item for sublist in preds for item in sublist]
len(final_probs)

# Reload the traced TorchScript model saved by train_loop and predict again
model_test = torch.jit.load('./CheckPoints/Pytorch1D.pt')
model_test.eval()
preds2 = []
with torch.no_grad():
    for x, y in test_dl:
        out = model_test(x.long())
        prob2 = F.softmax(out, dim=1)
#        print('{0}'.format(prob))
        preds2.append(prob2)
final_probs2 = [item for sublist in preds2 for item in sublist]
len(final_probs2)

# Output
sample.head()
sample['Adoption'] = [float(t[0]) for t in final_probs]
sample['Died'] = [float(t[1]) for t in final_probs]
sample['Euthanasia'] = [float(t[2]) for t in final_probs]
sample['Return_to_owner'] = [float(t[3]) for t in final_probs]
sample['Transfer'] = [float(t[4]) for t in final_probs]
sample.head()
sample.to_csv('samp.csv', index=False)
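
Since predictions are computed twice (the live model after training, and the reloaded TorchScript model), a quick consistency check can be appended (a sketch using the variables above; note the traced model carries the best-validation weights while model keeps the final-epoch weights, so a small gap is expected):

probs_live = torch.stack(final_probs).cpu().numpy()
probs_traced = torch.stack(final_probs2).cpu().numpy()
# any gap reflects final-epoch weights vs. the best-validation checkpoint
print("max abs difference:", np.abs(probs_live - probs_traced).max())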
The fasterAIModel4.py script is as follows:
#import pandas as pd
import numpy as np
#from collections import Counter
#from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import LabelEncoder
import torch
#from torch.utils.data import Dataset, DataLoader
import torch.optim as torch_optim
import torch.nn as nn
import torch.nn.functional as F
#from torchvision import models
#from datetime import datetime

class ShelterOutcomeModel(nn.Module):
    def __init__(self, embedding_sizes):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories, size in embedding_sizes])
        n_emb = sum(e.embedding_dim for e in self.embeddings)  # length of all embeddings combined
        self.n_emb = n_emb
        self.lin1 = nn.Linear(self.n_emb, 200)
        self.lin2 = nn.Linear(200, 70)
        self.lin3 = nn.Linear(70, 5)
        self.bn1 = nn.BatchNorm1d(self.n_emb)
        self.bn2 = nn.BatchNorm1d(200)
        self.bn3 = nn.BatchNorm1d(70)
        self.emb_drop = nn.Dropout(0.6)
        self.drops = nn.Dropout(0.3)

    def forward(self, x_cat):
        # look up each column's embedding and concatenate them
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)
        x = self.bn1(x)
        x = F.relu(self.lin1(x))
        x = self.drops(x)
        x = self.bn2(x)
        x = F.relu(self.lin2(x))
        x = self.drops(x)
        x = self.bn3(x)
        x = self.lin3(x)
        return x

def get_optimizer(model, lr=0.001, wd=0.0):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch_optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim

def train_model(model, optim, train_dl):
    model.train()
    total = 0
    sum_loss = 0
    for x, y in train_dl:
        batch = y.shape[0]
        output = model(x.long())
        loss = F.cross_entropy(output, y.long())
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += batch
        sum_loss += batch*(loss.item())
    return sum_loss/total

def val_model(model, valid_dl):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    for x, y in valid_dl:
        current_batch_size = y.shape[0]
        out = model(x.long())
        loss = F.cross_entropy(out, y.long())
        sum_loss += current_batch_size*(loss.item())
        total += current_batch_size
        pred = torch.max(out, 1)[1]
#        print(pred.type())
#        print(y.type())
        correct += (pred == y.long()).float().sum().item()
#    print("valid loss %.3f and accuracy %.3f" % (sum_loss/total, correct/total))
    return sum_loss/total, correct/total

def train_loop(device, model, train_dl, valid_dl, embedding_sizes, epochs, lr=0.01, wd=0.0):
    optim = get_optimizer(model, lr=lr, wd=wd)
    train_loss_iter = np.inf
    val_loss_iter = np.inf
    for i in range(epochs):
        train_loss = train_model(model, optim, train_dl)
        if train_loss < train_loss_iter:
            train_loss_iter = train_loss
            torch.save(model.state_dict(), './CheckPoints/train_weights.pth')
        val_loss, val_accuracy = val_model(model, valid_dl)
        print("epoch: {0}".format(i), "training loss: {:.5f}".format(train_loss), "val loss: {:.5f}".format(val_loss), "val accuracy: {:.5f}".format(val_accuracy))
        if val_loss < val_loss_iter:
            val_loss_iter = val_loss
            torch.save(model.state_dict(), './CheckPoints/val_weights.pth')
    # Save the best-validation model as TorchScript
    model_test = ShelterOutcomeModel(embedding_sizes).to(device)
    model_test.load_state_dict(torch.load('./CheckPoints/val_weights.pth'))
    model_test.eval()
    # dummy input for tracing: one row, one integer index per embedded column
    # (5 columns here; assumes a GPU is available)
    example = torch.rand(1, 5).type(torch.LongTensor).cuda(0)
    with torch.no_grad():
        model_test = torch.jit.trace(model_test, example)
    model_test.save('./CheckPoints/Pytorch1D.pt')
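
The tracing step above hardcodes .cuda(0), so it fails on a CPU-only machine. A minimal device-agnostic variant (a sketch under the same assumptions of five embedded columns and the checkpoint paths above):

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_test = ShelterOutcomeModel(embedding_sizes).to(device)
model_test.load_state_dict(torch.load('./CheckPoints/val_weights.pth', map_location=device))
model_test.eval()
example = torch.zeros(1, 5, dtype=torch.long, device=device)  # dummy integer input
with torch.no_grad():
    traced = torch.jit.trace(model_test, example)
traced.save('./CheckPoints/Pytorch1D.pt')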
The dataset.py script (imported in the main program as dataset4) is as follows:
#import pandas as pd
import numpy as np
#from collections import Counter
#from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
#import torch.optim as torch_optim
#import torch.nn as nn
#import torch.nn.functional as F
#from torchvision import models
#from datetime import datetime

# PyTorch Dataset
class ShelterOutcomeDataset(Dataset):
    def __init__(self, X, Y, embedded_col_names):
#        X = X.copy()
#        self.X1 = X.loc[:,embedded_col_names].copy().values.astype(np.int64) #categorical columns
#        self.X2 = X.drop(columns=embedded_col_names).copy().values.astype(np.float32) #numerical columns
#        self.y = Y
        # every column is treated as a categorical code; the model casts the
        # batch back to integers with x.long()
        self.x = X.copy().values.astype(np.float32)
        self.y = Y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')

def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list, tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device

    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl:
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)



References:
【1】 Implementing a multi-layer network with PyTorch
【2】 PyTorch in practice: the Kaggle house-price prediction competition
【3】 How to build an image classifier for garbage sorting
【4】 Deep learning on tabular data with PyTorch
【5】 https://jovian.ml/aakanksha-ns/shelter-outcome
【6】 Using PyTorch C++ libtorch (2): calling a PyTorch model from Qt



