Classifying valence bond structures
Test of a simple image classifier, using TensorFlow and Keras
Based on the TensorFlow tutorial: https://www.tensorflow.org/tutorials/images/classification
import os
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import PIL
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
dataset = "Valenzstrich.tar"
dataset_path = os.path.abspath("./" + dataset)
data_dir = tf.keras.utils.get_file('Valenzstrich.tar', origin='file:\\' + dataset_path, extract=True)
data_dir = pathlib.Path(data_dir).with_suffix('')
print(data_dir)
C:\Users\Marcus\.keras\datasets\Valenzstrich
Note: outside of Jupyter notebooks, the origin scheme must be written as "file://".
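A more portable way to build this URL (a sketch, not the cell that was actually run above) is to let pathlib construct the URI, which yields a well-formed "file://" URL on Windows, Linux, and macOS alike:

import pathlib
import tensorflow as tf

# Hypothetical variant: Path.as_uri() requires an absolute path, hence resolve().
dataset_uri = pathlib.Path("Valenzstrich.tar").resolve().as_uri()
data_dir = tf.keras.utils.get_file('Valenzstrich.tar', origin=dataset_uri, extract=True)
data_dir = pathlib.Path(data_dir).with_suffix('')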
image_count = len(list(data_dir.glob('*/*.png')))
print(image_count)
2386
alkohols = list(data_dir.glob('Alkohole/*'))
PIL.Image.open(str(alkohols[0]))
PIL.Image.open(str(alkohols[1]))
amine = list(data_dir.glob('Amine/*'))
PIL.Image.open(str(amine[0]))
PIL.Image.open(str(amine[1]))
Next, load these images using the helpful tf.keras.utils.image_dataset_from_directory utility. This takes you from a directory of images on disk to a tf.data.Dataset in just a couple of lines of code. If you like, you can also write your own data-loading code from scratch by visiting the Load and preprocess images tutorial.
Create a dataset
Define some parameters for the loader:
batch_size = 64
img_height = 100
img_width = 100
Use 80% of the images for training and 20% for validation.
train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split=0.2,
    subset="training",
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size)
Found 2386 files belonging to 9 classes. Using 1909 files for training.
val_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split=0.2,
    subset="validation",
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size)
Found 2386 files belonging to 9 classes. Using 477 files for validation.
class_names = train_ds.class_names
print(class_names)
['Aldehyde', 'Alkane', 'Alkohole', 'Amine', 'Bromide', 'Chloride', 'Ketone', 'Saeuren', 'Thiole']
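The class names are taken from the (German) directory names: Aldehyde (aldehydes), Alkane (alkanes), Alkohole (alcohols), Amine (amines), Bromide (bromides), Chloride (chlorides), Ketone (ketones), Saeuren (acids), Thiole (thiols).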
plt.figure(figsize=(5, 5))
for images, labels in train_ds.take(1):
    for i in range(9):
        ax = plt.subplot(3, 3, i + 1)
        plt.imshow(images[i].numpy().astype("uint8"))
        plt.title(class_names[labels[i]])
        plt.axis("off")
for image_batch, labels_batch in train_ds:
    print(image_batch.shape)
    print(labels_batch.shape)
    break
(64, 100, 100, 3)
(64,)
The image_batch is a tensor of shape (64, 100, 100, 3): a batch of 64 images of shape 100x100x3 (the last dimension refers to the RGB color channels). The labels_batch is a tensor of shape (64,); these are the labels corresponding to the 64 images.
AUTOTUNE = tf.data.AUTOTUNE
# Cache the decoded images after the first epoch, shuffle the training data,
# and overlap preprocessing with training via prefetching.
train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
normalization_layer = layers.Rescaling(1./255)
normalized_ds = train_ds.map(lambda x, y: (normalization_layer(x), y))
image_batch, labels_batch = next(iter(normalized_ds))
first_image = image_batch[0]
# Notice the pixel values are now in `[0,1]`.
print(np.min(first_image), np.max(first_image))
0.0 1.0
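Note that this normalized dataset is built here only as a demonstration: the model defined below begins with its own Rescaling layer, so the unnormalized train_ds and val_ds are passed to model.fit as-is.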
Create the model
num_classes = len(class_names)
model = Sequential([
    layers.Rescaling(1./255, input_shape=(img_height, img_width, 3)),
    layers.Conv2D(16, 3, padding='same', activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(32, 3, padding='same', activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(64, 3, padding='same', activation='relu'),
    layers.MaxPooling2D(),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_classes)
])
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
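The loss is created with from_logits=True because the final Dense layer has no softmax activation and therefore outputs raw logits; the softmax is applied explicitly later (via tf.nn.softmax) when the predictions are interpreted.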
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= rescaling_1 (Rescaling) (None, 100, 100, 3) 0 conv2d (Conv2D) (None, 100, 100, 16) 448 max_pooling2d (MaxPooling2D (None, 50, 50, 16) 0 ) conv2d_1 (Conv2D) (None, 50, 50, 32) 4640 max_pooling2d_1 (MaxPooling (None, 25, 25, 32) 0 2D) conv2d_2 (Conv2D) (None, 25, 25, 64) 18496 max_pooling2d_2 (MaxPooling (None, 12, 12, 64) 0 2D) flatten (Flatten) (None, 9216) 0 dense (Dense) (None, 128) 1179776 dense_1 (Dense) (None, 9) 1161 ================================================================= Total params: 1,204,521 Trainable params: 1,204,521 Non-trainable params: 0 _________________________________________________________________
epochs = 10
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs
)
Epoch 1/10
30/30 [==============================] - 5s 140ms/step - loss: 2.2010 - accuracy: 0.1676 - val_loss: 2.1495 - val_accuracy: 0.1929
Epoch 2/10
30/30 [==============================] - 4s 130ms/step - loss: 2.0907 - accuracy: 0.2090 - val_loss: 2.0820 - val_accuracy: 0.2222
Epoch 3/10
30/30 [==============================] - 4s 127ms/step - loss: 1.9031 - accuracy: 0.2996 - val_loss: 1.9083 - val_accuracy: 0.3040
Epoch 4/10
30/30 [==============================] - 4s 138ms/step - loss: 1.6679 - accuracy: 0.3855 - val_loss: 1.8077 - val_accuracy: 0.2956
Epoch 5/10
30/30 [==============================] - 4s 131ms/step - loss: 1.3560 - accuracy: 0.4976 - val_loss: 1.4822 - val_accuracy: 0.4172
Epoch 6/10
30/30 [==============================] - 5s 152ms/step - loss: 1.0244 - accuracy: 0.6260 - val_loss: 1.2118 - val_accuracy: 0.5283
Epoch 7/10
30/30 [==============================] - 4s 147ms/step - loss: 0.7442 - accuracy: 0.7213 - val_loss: 1.0752 - val_accuracy: 0.5262
Epoch 8/10
30/30 [==============================] - 4s 143ms/step - loss: 0.6019 - accuracy: 0.7664 - val_loss: 0.9610 - val_accuracy: 0.6038
Epoch 9/10
30/30 [==============================] - 4s 133ms/step - loss: 0.4826 - accuracy: 0.8130 - val_loss: 0.9414 - val_accuracy: 0.6415
Epoch 10/10
30/30 [==============================] - 4s 143ms/step - loss: 0.4068 - accuracy: 0.8355 - val_loss: 0.9346 - val_accuracy: 0.6164
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs_range = range(epochs)
plt.figure(figsize=(8, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()
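The curves mirror the log above: training accuracy climbs to about 0.84 while validation accuracy plateaus around 0.62, and the validation loss barely improves after epoch 8. This widening gap suggests the model is beginning to overfit the training set.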
print(os.getcwd())
molekul_path = os.path.join(os.getcwd(), "Molekul.png")
img = tf.keras.utils.load_img(
    molekul_path, target_size=(img_height, img_width)
)
img_array = tf.keras.utils.img_to_array(img)
img_array = tf.expand_dims(img_array, 0)  # Create a batch
predictions = model.predict(img_array)
score = tf.nn.softmax(predictions[0])
print(
    "This image most likely belongs to {} with a {:.2f} percent confidence."
    .format(class_names[np.argmax(score)], 100 * np.max(score))
)
PIL.Image.open(molekul_path)
C:\Users\Marcus
1/1 [==============================] - 0s 114ms/step
This image most likely belongs to Amine with a 83.97 percent confidence.
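Since the same prediction cell is repeated below for Molekul_2.png and Molekul_3.png, the steps could be wrapped in a small helper (a sketch; classify_image is a hypothetical name, not part of the original notebook):

def classify_image(path, model, class_names, img_height=100, img_width=100):
    # Load one image, batch it, and report the model's most likely class.
    img = tf.keras.utils.load_img(path, target_size=(img_height, img_width))
    img_array = tf.expand_dims(tf.keras.utils.img_to_array(img), 0)  # batch of 1
    score = tf.nn.softmax(model.predict(img_array)[0])  # logits -> probabilities
    print(
        "This image most likely belongs to {} with a {:.2f} percent confidence."
        .format(class_names[np.argmax(score)], 100 * np.max(score))
    )

classify_image(os.path.join(os.getcwd(), "Molekul.png"), model, class_names)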
print(os.getcwd())
molekul_path = os.path.join(os.getcwd(), "Molekul_2.png")
img = tf.keras.utils.load_img(
    molekul_path, target_size=(img_height, img_width)
)
img_array = tf.keras.utils.img_to_array(img)
img_array = tf.expand_dims(img_array, 0)  # Create a batch
predictions = model.predict(img_array)
score = tf.nn.softmax(predictions[0])
print(
    "This image most likely belongs to {} with a {:.2f} percent confidence."
    .format(class_names[np.argmax(score)], 100 * np.max(score))
)
PIL.Image.open(molekul_path)
C:\Users\Marcus
1/1 [==============================] - 0s 16ms/step
This image most likely belongs to Chloride with a 92.19 percent confidence.
print(os.getcwd())
molekul_path = os.path.join(os.getcwd(), "Molekul_3.png")
img = tf.keras.utils.load_img(
    molekul_path, target_size=(img_height, img_width)
)
img_array = tf.keras.utils.img_to_array(img)
img_array = tf.expand_dims(img_array, 0)  # Create a batch
predictions = model.predict(img_array)
score = tf.nn.softmax(predictions[0])
print(
    "This image most likely belongs to {} with a {:.2f} percent confidence."
    .format(class_names[np.argmax(score)], 100 * np.max(score))
)
PIL.Image.open(molekul_path)
C:\Users\Marcus
1/1 [==============================] - 0s 16ms/step
This image most likely belongs to Saeuren with a 99.65 percent confidence.