Underwhelming performance in neural network with PyTorch

I am trying to train a network to classify cat vs dog images. I’m only getting about 75% correct at this point. Is there anything obvious (to an expert) that I can do to make it perform better?

The latest img_train.py code file is at: https://github.com/sjhalayka/pytorch_cats_vs_dogs/blob/main/img_train.py

The data that I’m using are at: Cat and Dog | Kaggle

For the record, the current code is:

import numpy as np
import math
import cv2
import random
import torch
from torch import flatten
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

import os.path
from os import path



img_width = 64          # images are resized to img_width x img_width before training
num_channels = 3        # colour channels as loaded by cv2.imread (BGR)

#num_input_components = img_width*img_width*num_channels
num_output_components = 2  # two classes: index 0 = cat, index 1 = dog

num_epochs = 1000       # number of full-batch gradient steps
learning_rate = 0.001   # Adam step size






class Net(torch.nn.Module):
	"""Small CNN classifier for 64x64 images (3 conv+pool stages, 2 FC layers).

	Args:
		num_channels: number of input image channels (3 for BGR/RGB).
		num_output_components: number of output classes; forward() returns
			raw scores (logits) of this width.
		all_train_files_len: legacy hidden-layer width. NOTE(review): the
			caller passes the training-set size here, which couples model
			capacity to dataset size — prefer passing an explicit
			hidden_units (e.g. 512).
		hidden_units: optional explicit hidden fully-connected width;
			defaults to all_train_files_len for backward compatibility.
	"""

	def __init__(self, num_channels, num_output_components, all_train_files_len, hidden_units=None):
		super().__init__()
		if hidden_units is None:
			hidden_units = all_train_files_len
		self.model = torch.nn.Sequential(
			# Input = num_channels x 64 x 64, Output = 32 x 64 x 64
			torch.nn.Conv2d(in_channels=num_channels, out_channels=32, kernel_size=3, padding=1),
			torch.nn.ReLU(),
			# 32 x 64 x 64 -> 32 x 32 x 32
			torch.nn.MaxPool2d(kernel_size=2),

			# 32 x 32 x 32 -> 64 x 32 x 32
			torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
			torch.nn.ReLU(),
			# 64 x 32 x 32 -> 64 x 16 x 16
			torch.nn.MaxPool2d(kernel_size=2),

			# 64 x 16 x 16 -> 64 x 16 x 16
			torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1),
			torch.nn.ReLU(),
			# 64 x 16 x 16 -> 64 x 8 x 8
			torch.nn.MaxPool2d(kernel_size=2),

			# 64 * 8 * 8 = 4096 — hard-coded for 64x64 inputs (img_width == 64)
			torch.nn.Flatten(),
			torch.nn.Linear(4096, hidden_units),
			torch.nn.ReLU(),
			torch.nn.Linear(hidden_units, num_output_components)
		)

	def forward(self, x):
		"""Return raw class scores of shape (batch, num_output_components)."""
		return self.model(x)

"""
	def __init__(self):
		super(Net, self).__init__()
		self.hidden1 = torch.nn.Linear(num_input_components, 8192)
		self.hidden2 = torch.nn.Linear(8192, 1024) 
		self.hidden3 = torch.nn.Linear(1024, 128)
		self.predict = torch.nn.Linear(128, num_output_components)

	def forward(self, x):
		x = torch.tanh(self.hidden1(x))		
		x = torch.tanh(self.hidden2(x))
		x = torch.tanh(self.hidden3(x))
		x = self.predict(x)    # linear output
		return x
"""



class float_image:
	"""Thin wrapper around a preprocessed image array (float, CHW layout)."""

	def __init__(self, img):
		# Stored by reference; no copy is made.
		self.img = img

class image_type:
	"""Pairs a class label with its preprocessed image.

	In this script img_type is 0 for cats and 1 for dogs (see the
	training-set loading loops).
	"""

	def __init__(self, img_type, float_img):
		self.img_type = img_type
		self.float_img = float_img




def _load_labeled_images(dir_path, label, dest):
	"""Append one image_type(label, chw_image) to dest per readable image.

	Each image is resized to img_width x img_width, scaled to [0, 1],
	and converted HWC -> CHW to match Conv2d's expected layout.
	Unreadable files are reported and skipped.
	"""
	for f in next(os.walk(dir_path))[2]:
		print(dir_path + f)
		img = cv2.imread(dir_path + f)
		if img is None:
			print("image read failure")
			continue
		img = img.astype(np.float32)
		res = cv2.resize(img, dsize=(img_width, img_width), interpolation=cv2.INTER_LINEAR)
		chw = np.transpose(res / 255.0, (2, 0, 1))
		dest.append(image_type(label, chw))


if False: #path.exists('weights_' + str(num_input_components) + '_' + str(num_epochs) + '.pth'):
	# Weight loading is disabled; the condition above also references the
	# no-longer-defined num_input_components and would need updating first.
	net.load_state_dict(torch.load('weights_' + str(num_input_components) + '_' + str(num_epochs) + '.pth'))
	print("loaded file successfully")
else:
	print("training...")

	all_train_files = []
	_load_labeled_images('training_set/cats/', 0, all_train_files)  # label 0 = cat
	_load_labeled_images('training_set/dogs/', 1, all_train_files)  # label 1 = dog

	net = Net(num_channels, num_output_components, len(all_train_files))
	optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
	# CrossEntropyLoss (log-softmax + negative log-likelihood on raw logits)
	# replaces MSELoss: MSE against one-hot vectors is a regression loss and
	# is a poor fit for classification.
	loss_func = torch.nn.CrossEntropyLoss()

	# Pre-allocated full-batch buffers, refilled (shuffled) every epoch.
	batch = np.zeros((len(all_train_files), num_channels, img_width, img_width), dtype=np.float32)
	ground_truth = np.zeros(len(all_train_files), dtype=np.int64)  # class indices, not one-hot

	for epoch in range(num_epochs):

		random.shuffle(all_train_files)

		for count, item in enumerate(all_train_files):
			batch[count] = item.float_img
			ground_truth[count] = item.img_type

		# torch.autograd.Variable is deprecated; plain tensors track
		# gradients where needed.
		x = torch.from_numpy(batch)
		y = torch.from_numpy(ground_truth)

		prediction = net(x)  # raw logits, shape (N, num_output_components)
		loss = loss_func(prediction, y)

		print(epoch, loss)

		optimizer.zero_grad()   # clear gradients from the previous step
		loss.backward()         # backpropagation: compute gradients
		optimizer.step()        # apply gradients



	#torch.save(net.state_dict(), 'weights_' + str(num_input_components) + '_' + str(num_epochs) + '.pth')



# Evaluate per-class accuracy on the held-out cat images.
path = 'test_set/cats/'
filenames = next(os.walk(path))[2]

cat_count = 0
total_count = 0

for f in filenames:

	img = cv2.imread(path + f)

	if img is None:
		print("image read failure")
		continue

	# Same preprocessing as training: resize, scale to [0, 1], HWC -> CHW.
	img = img.astype(np.float32)
	res = cv2.resize(img, dsize=(img_width, img_width), interpolation=cv2.INTER_LINEAR)
	chw = np.transpose(res / 255.0, (2, 0, 1))

	batch = torch.from_numpy(chw).unsqueeze(0)  # shape (1, C, H, W)

	# Inference only: no_grad avoids building the autograd graph.
	# (torch.autograd.Variable is deprecated and is not needed here.)
	with torch.no_grad():
		prediction = net(batch)

	if prediction[0][0] > prediction[0][1]:  # index 0 = cat
		cat_count = cat_count + 1

	total_count = total_count + 1

# Guard against an empty/unreadable test directory (would divide by zero).
if total_count > 0:
	print(cat_count / total_count)  # fraction of cats classified as cat
else:
	print("no readable test images")
print(total_count)





# Evaluate per-class accuracy on the held-out dog images.
path = 'test_set/dogs/'
filenames = next(os.walk(path))[2]

dog_count = 0
total_count = 0

for f in filenames:

	img = cv2.imread(path + f)

	if img is None:
		print("image read failure")
		continue

	# Same preprocessing as training: resize, scale to [0, 1], HWC -> CHW.
	img = img.astype(np.float32)
	res = cv2.resize(img, dsize=(img_width, img_width), interpolation=cv2.INTER_LINEAR)
	chw = np.transpose(res / 255.0, (2, 0, 1))

	batch = torch.from_numpy(chw).unsqueeze(0)  # shape (1, C, H, W)

	# Inference only: no_grad avoids building the autograd graph.
	# (torch.autograd.Variable is deprecated and is not needed here.)
	with torch.no_grad():
		prediction = net(batch)

	if prediction[0][0] < prediction[0][1]:  # index 1 = dog
		dog_count = dog_count + 1

	total_count = total_count + 1

# Guard against an empty/unreadable test directory (would divide by zero).
if total_count > 0:
	print(dog_count / total_count)  # fraction of dogs classified as dog
else:
	print("no readable test images")
print(total_count)

I would use a framework like fastai with models from timm.

IIRC you should probably be able to get much better results with “3 lines of code”. For more info check the first lesson in the course for details

2 Likes

That looks like a leftover from the previous regression attempt.
For classification, you want some kind of softmax as the last layer.

1 Like

Hi berak:

Yes, the code is a product of my experimentation with PyTorch. I am a raw beginner, so I might be doing something counterproductive. I see why you’d use the Softmax, it gives a set of probabilities that sum to 1. Reminds me of information theory. Thanks for the tip! Much appreciated.

  • Shawn

Thank you for the link.

How do I interpret the result of Softmax? How does it affect the one-hot prediction?

Thanks for any help that you can provide.

may I recommend taking this to a forum for PyTorch or general deep learning?

1 Like

i recommend cs231n in general

1 Like

Much appreciated! I’ll move this to another forum. Thanks for all of the help though.

Sorry, one last set of questions:

Would you say that a neural network performs non-linear regression? I mean, when it’s not a classification problem, it’s a regression problem, right?