This tutorial was sparked by a conversation with Dr. May Wang years ago. Also see this blog. This note cannot be reproduced on Posit Cloud: it requires downloading the food image data from Kaggle and significant computing resources (memory, GPU) to run the code.
library(imager)
library(keras)
library(tidyverse)
Keywords: Convolutional neural network (CNN), transfer learning.
We will use the Food-101 dataset, which contains 101 food categories, each with 1,000 images. The images are of varying sizes and aspect ratios. We will resize the images to 224x224 pixels, which is the input size required by the pre-trained model we will use.
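As a quick illustration, a single image can be loaded and resized on the fly with keras::image_load(); a minimal sketch (not evaluated here), using one of the apple_pie files shown below:

# load one image, resizing to the 224x224 input size expected by the model
img <- image_load("food/images/apple_pie/1005649.jpg", target_size = c(224, 224))
# the resized image should be 224 x 224 with 3 color channels
dim(image_to_array(img))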
ls food/images
apple_pie
baby_back_ribs
baklava
beef_carpaccio
beef_tartare
beet_salad
beignets
bibimbap
bread_pudding
breakfast_burrito
bruschetta
caesar_salad
cannoli
caprese_salad
carrot_cake
ceviche
cheese_plate
cheesecake
chicken_curry
chicken_quesadilla
chicken_wings
chocolate_cake
chocolate_mousse
churros
clam_chowder
club_sandwich
crab_cakes
creme_brulee
croque_madame
cup_cakes
deviled_eggs
donuts
dumplings
edamame
eggs_benedict
escargots
falafel
filet_mignon
fish_and_chips
foie_gras
french_fries
french_onion_soup
french_toast
fried_calamari
fried_rice
frozen_yogurt
garlic_bread
gnocchi
greek_salad
grilled_cheese_sandwich
grilled_salmon
guacamole
gyoza
hamburger
hot_and_sour_soup
hot_dog
huevos_rancheros
hummus
ice_cream
lasagna
lobster_bisque
lobster_roll_sandwich
macaroni_and_cheese
macarons
miso_soup
mussels
nachos
omelette
onion_rings
oysters
pad_thai
paella
pancakes
panna_cotta
peking_duck
pho
pizza
pork_chop
poutine
prime_rib
pulled_pork_sandwich
ramen
ravioli
red_velvet_cake
risotto
samosa
sashimi
scallops
seaweed_salad
shrimp_and_grits
spaghetti_bolognese
spaghetti_carbonara
spring_rolls
steak
strawberry_shortcake
sushi
tacos
takoyaki
tiramisu
tuna_tartare
waffles
Create a tibble of file paths and labels for all food images.
<- list.dirs("food/images", full.names = TRUE, recursive = FALSE) %>%
image_df map_dfr(~tibble(filepath = list.files(.x, full.names = TRUE), label = basename(.x))) %>%
# mutate(filepath = str_c(getwd(), filepath, sep = "/")) %>%
mutate(label = factor(label)) %>%
print()
# A tibble: 101,000 × 2
filepath label
<chr> <fct>
1 food/images/apple_pie/1005649.jpg apple_pie
2 food/images/apple_pie/1011328.jpg apple_pie
3 food/images/apple_pie/101251.jpg apple_pie
4 food/images/apple_pie/1014775.jpg apple_pie
5 food/images/apple_pie/1026328.jpg apple_pie
6 food/images/apple_pie/1028787.jpg apple_pie
7 food/images/apple_pie/1034399.jpg apple_pie
8 food/images/apple_pie/103801.jpg apple_pie
9 food/images/apple_pie/1038694.jpg apple_pie
10 food/images/apple_pie/1043283.jpg apple_pie
# ℹ 100,990 more rows
Show a few random sample images.
set.seed(2024)
num_images <- 6
# par(mar = c(1, 1, 1, 1), mfrow = c(2, 3))
for (i in 1:num_images) {
  img <- image_df |> slice_sample(n = 1)
  plot(load.image(img[["filepath"]]))
  title(img[["label"]])
}
How many images per food category?
image_df %>%
  count(label)
# A tibble: 101 × 2
label n
<fct> <int>
1 apple_pie 1000
2 baby_back_ribs 1000
3 baklava 1000
4 beef_carpaccio 1000
5 beef_tartare 1000
6 beet_salad 1000
7 beignets 1000
8 bibimbap 1000
9 bread_pudding 1000
10 breakfast_burrito 1000
# ℹ 91 more rows
To save trees, we will use only 100 images per food category.
image_df <- image_df %>%
  group_by(label) %>%
  slice_sample(n = 100) %>%
  ungroup() %>%
  print()
# A tibble: 10,100 × 2
filepath label
<chr> <fct>
1 food/images/apple_pie/3760032.jpg apple_pie
2 food/images/apple_pie/793078.jpg apple_pie
3 food/images/apple_pie/1220194.jpg apple_pie
4 food/images/apple_pie/1954841.jpg apple_pie
5 food/images/apple_pie/2517059.jpg apple_pie
6 food/images/apple_pie/80735.jpg apple_pie
7 food/images/apple_pie/1106961.jpg apple_pie
8 food/images/apple_pie/1622276.jpg apple_pie
9 food/images/apple_pie/2923139.jpg apple_pie
10 food/images/apple_pie/2538221.jpg apple_pie
# ℹ 10,090 more rows
Sanity check:
image_df %>%
  count(label)
# A tibble: 101 × 2
label n
<fct> <int>
1 apple_pie 100
2 baby_back_ribs 100
3 baklava 100
4 beef_carpaccio 100
5 beef_tartare 100
6 beet_salad 100
7 beignets 100
8 bibimbap 100
9 bread_pudding 100
10 breakfast_burrito 100
# ℹ 91 more rows
We split the data into 60% training, 20% validation, and 20% testing sets.
set.seed(2024)
spec <- c(train = .6, test = .2, validate = .2)

g <- sample(cut(
  seq(nrow(image_df)),
  nrow(image_df) * cumsum(c(0, spec)),
  labels = names(spec)
))
image_split <- split(image_df, g)
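As a quick sanity check, the three pieces should contain roughly 60%, 20%, and 20% of the 10,100 rows; a minimal sketch (not run above):

# expected: about 6060 (train), 2020 (validate), and 2020 (test) rows
sapply(image_split, nrow)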
Training set.
train_data <- image_split$train
x_train <- array(dim = c(nrow(train_data), 224, 224, 3))
for (i in 1:nrow(train_data)) {
  img <- image_load(train_data$filepath[i], target_size = c(224, 224))
  x_train[i, , ,] <- image_to_array(img)
}
x_train <- mobilenet_preprocess_input(x_train)
# x_train <- resnet_v2_preprocess_input(x_train)
# x_train <- imagenet_preprocess_input(x_train)
dim(x_train)
[1] 6060 224 224 3
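mobilenet_preprocess_input() rescales raw pixel values from [0, 255] to [-1, 1], the convention MobileNet was trained with. A quick check (a sketch, not run above):

# preprocessed values should lie in [-1, 1]
range(x_train)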
Encode labels as a binary class matrix.
y_train <- to_categorical(as.integer(factor(train_data$label)) - 1)
dim(y_train)
[1] 6060 101
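To see what to_categorical() does, here is a toy example: each 0-based integer class index becomes a one-hot indicator row.

to_categorical(c(0, 1, 2))
#      [,1] [,2] [,3]
# [1,]    1    0    0
# [2,]    0    1    0
# [3,]    0    0    1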
Validation set.
validate_data <- image_split$validate
x_validate <- array(dim = c(nrow(validate_data), 224, 224, 3))
for (i in 1:nrow(validate_data)) {
  img <- image_load(validate_data$filepath[i], target_size = c(224, 224))
  x_validate[i, , ,] <- image_to_array(img)
}
x_validate <- mobilenet_preprocess_input(x_validate)
# x_validate <- resnet_v2_preprocess_input(x_validate)
# x_validate <- imagenet_preprocess_input(x_validate)
dim(x_validate)
[1] 2020 224 224 3
y_validate <- to_categorical(as.integer(factor(validate_data$label)) - 1)
dim(y_validate)
[1] 2020 101
Test set.
test_data <- image_split$test
x_test <- array(dim = c(nrow(test_data), 224, 224, 3))
for (i in 1:nrow(test_data)) {
  img <- image_load(test_data$filepath[i], target_size = c(224, 224))
  x_test[i, , ,] <- image_to_array(img)
}
x_test <- mobilenet_preprocess_input(x_test)
# x_test <- resnet_v2_preprocess_input(x_test)
# x_test <- imagenet_preprocess_input(x_test)
dim(x_test)
dim(x_test)
[1] 2020 224 224 3
y_test <- to_categorical(as.integer(factor(test_data$label)) - 1)
dim(y_test)
[1] 2020 101
Instead of building and training a CNN from scratch, we will take a pre-trained model, MobileNet, and fine-tune it on our food image dataset. MobileNet is a CNN that has been pre-trained on the ImageNet dataset. We will use it to extract features from our food images and then train a new classification head on top of these features.
For a list of available pre-trained models, see Keras Applications.
pretrained_model <- application_mobilenet(
# pretrained_model <- application_resnet50_v2(
  weights = "imagenet",
  include_top = FALSE,
  input_shape = c(224, 224, 3),
  pooling = "avg"
)
# Freeze the weights of the pre-trained model
pretrained_model$trainable <- FALSE
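Equivalently, the keras R package provides the freeze_weights() helper; a minimal sketch:

# freeze_weights(pretrained_model)  # same effect as pretrained_model$trainable <- FALSE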
Add additional dense layers on top of the pre-trained model.
# Dropout doesn't seem to help
# One layer of 256 units is slightly better than 2 layers of 128 units
model <- keras_model_sequential() %>%
  pretrained_model %>%
  layer_dense(units = 256, activation = "relu") %>%
  # layer_dropout(rate = 0.5) %>%
  # layer_dense(units = 128, activation = "relu") %>%
  # layer_dropout(rate = 0.5) %>%
  layer_dense(units = 101, activation = "softmax")
summary(model)
Model: "sequential"
________________________________________________________________________________
 Layer (type)                      Output Shape            Param #    Trainable 
================================================================================
 mobilenet_1.00_224 (Functional)   (None, 1024)            3228864    N         
 dense_1 (Dense)                   (None, 256)             262400     Y         
 dense (Dense)                     (None, 101)             25957      Y         
================================================================================
Total params: 3517221 (13.42 MB)
Trainable params: 288357 (1.10 MB)
Non-trainable params: 3228864 (12.32 MB)
________________________________________________________________________________
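For reference, the same architecture can also be written with the functional API; a minimal sketch (assuming the same pretrained_model; model_fn is a hypothetical name, and this block is not run here):

inputs <- layer_input(shape = c(224, 224, 3))
outputs <- inputs %>%
  pretrained_model() %>%                            # pre-trained feature extractor
  layer_dense(units = 256, activation = "relu") %>% # new hidden layer
  layer_dense(units = 101, activation = "softmax")  # one unit per food category
model_fn <- keras_model(inputs, outputs)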
Compile the model.
# Slower learning rate helps
# No big difference between rmsprop and adam
model %>%
  compile(
    loss = "categorical_crossentropy",
    # optimizer = "rmsprop",
    optimizer = optimizer_rmsprop(learning_rate = 0.00005),
    # optimizer = optimizer_adam(learning_rate = 0.0001),
    metrics = c("accuracy")
  )
Train the model, with early stopping based on validation accuracy.
history <- model %>%
  fit(
    x_train,
    y_train,
    validation_data = list(x_validate, y_validate),
    epochs = 50,
    batch_size = 32,
    shuffle = TRUE,
    callbacks = list(
      callback_early_stopping(
        monitor = "val_accuracy",
        patience = 3,
        restore_best_weights = TRUE
      )
    )
  )
Epoch 1/50
190/190 - 7s - loss: 4.9758 - accuracy: 0.0175 - val_loss: 4.7267 - val_accuracy: 0.0267 - 7s/epoch - 37ms/step
Epoch 2/50
190/190 - 5s - loss: 4.5055 - accuracy: 0.0508 - val_loss: 4.4108 - val_accuracy: 0.0594 - 5s/epoch - 26ms/step
Epoch 3/50
190/190 - 5s - loss: 4.1650 - accuracy: 0.0896 - val_loss: 4.1419 - val_accuracy: 0.0950 - 5s/epoch - 26ms/step
Epoch 4/50
190/190 - 5s - loss: 3.8680 - accuracy: 0.1394 - val_loss: 3.9147 - val_accuracy: 0.1312 - 5s/epoch - 26ms/step
Epoch 5/50
190/190 - 5s - loss: 3.6009 - accuracy: 0.1898 - val_loss: 3.7026 - val_accuracy: 0.1614 - 5s/epoch - 26ms/step
Epoch 6/50
190/190 - 5s - loss: 3.3649 - accuracy: 0.2333 - val_loss: 3.5097 - val_accuracy: 0.2025 - 5s/epoch - 26ms/step
Epoch 7/50
190/190 - 5s - loss: 3.1515 - accuracy: 0.2728 - val_loss: 3.3493 - val_accuracy: 0.2257 - 5s/epoch - 26ms/step
Epoch 8/50
190/190 - 5s - loss: 2.9628 - accuracy: 0.3153 - val_loss: 3.2081 - val_accuracy: 0.2550 - 5s/epoch - 25ms/step
Epoch 9/50
190/190 - 5s - loss: 2.7954 - accuracy: 0.3538 - val_loss: 3.0802 - val_accuracy: 0.2792 - 5s/epoch - 25ms/step
Epoch 10/50
190/190 - 5s - loss: 2.6454 - accuracy: 0.3855 - val_loss: 2.9723 - val_accuracy: 0.2995 - 5s/epoch - 26ms/step
Epoch 11/50
190/190 - 5s - loss: 2.5095 - accuracy: 0.4177 - val_loss: 2.8824 - val_accuracy: 0.3163 - 5s/epoch - 26ms/step
Epoch 12/50
190/190 - 5s - loss: 2.3902 - accuracy: 0.4439 - val_loss: 2.8023 - val_accuracy: 0.3243 - 5s/epoch - 26ms/step
Epoch 13/50
190/190 - 5s - loss: 2.2810 - accuracy: 0.4662 - val_loss: 2.7216 - val_accuracy: 0.3406 - 5s/epoch - 25ms/step
Epoch 14/50
190/190 - 5s - loss: 2.1807 - accuracy: 0.4934 - val_loss: 2.6657 - val_accuracy: 0.3470 - 5s/epoch - 25ms/step
Epoch 15/50
190/190 - 5s - loss: 2.0902 - accuracy: 0.5081 - val_loss: 2.6041 - val_accuracy: 0.3550 - 5s/epoch - 27ms/step
Epoch 16/50
190/190 - 5s - loss: 2.0097 - accuracy: 0.5297 - val_loss: 2.5553 - val_accuracy: 0.3723 - 5s/epoch - 26ms/step
Epoch 17/50
190/190 - 5s - loss: 1.9305 - accuracy: 0.5437 - val_loss: 2.5117 - val_accuracy: 0.3718 - 5s/epoch - 25ms/step
Epoch 18/50
190/190 - 5s - loss: 1.8631 - accuracy: 0.5658 - val_loss: 2.4710 - val_accuracy: 0.3876 - 5s/epoch - 26ms/step
Epoch 19/50
190/190 - 5s - loss: 1.7963 - accuracy: 0.5814 - val_loss: 2.4410 - val_accuracy: 0.3936 - 5s/epoch - 26ms/step
Epoch 20/50
190/190 - 5s - loss: 1.7352 - accuracy: 0.5959 - val_loss: 2.4123 - val_accuracy: 0.4030 - 5s/epoch - 27ms/step
Epoch 21/50
190/190 - 5s - loss: 1.6798 - accuracy: 0.6119 - val_loss: 2.3765 - val_accuracy: 0.4045 - 5s/epoch - 26ms/step
Epoch 22/50
190/190 - 5s - loss: 1.6250 - accuracy: 0.6198 - val_loss: 2.3579 - val_accuracy: 0.4119 - 5s/epoch - 26ms/step
Epoch 23/50
190/190 - 5s - loss: 1.5733 - accuracy: 0.6333 - val_loss: 2.3347 - val_accuracy: 0.4183 - 5s/epoch - 26ms/step
Epoch 24/50
190/190 - 5s - loss: 1.5256 - accuracy: 0.6447 - val_loss: 2.3103 - val_accuracy: 0.4218 - 5s/epoch - 26ms/step
Epoch 25/50
190/190 - 5s - loss: 1.4811 - accuracy: 0.6550 - val_loss: 2.2931 - val_accuracy: 0.4243 - 5s/epoch - 26ms/step
Epoch 26/50
190/190 - 5s - loss: 1.4354 - accuracy: 0.6677 - val_loss: 2.2851 - val_accuracy: 0.4302 - 5s/epoch - 26ms/step
Epoch 27/50
190/190 - 5s - loss: 1.3956 - accuracy: 0.6800 - val_loss: 2.2620 - val_accuracy: 0.4327 - 5s/epoch - 26ms/step
Epoch 28/50
190/190 - 5s - loss: 1.3527 - accuracy: 0.6898 - val_loss: 2.2545 - val_accuracy: 0.4416 - 5s/epoch - 26ms/step
Epoch 29/50
190/190 - 5s - loss: 1.3137 - accuracy: 0.6998 - val_loss: 2.2431 - val_accuracy: 0.4322 - 5s/epoch - 25ms/step
Epoch 30/50
190/190 - 5s - loss: 1.2791 - accuracy: 0.7135 - val_loss: 2.2283 - val_accuracy: 0.4401 - 5s/epoch - 25ms/step
Epoch 31/50
190/190 - 5s - loss: 1.2434 - accuracy: 0.7182 - val_loss: 2.2083 - val_accuracy: 0.4441 - 5s/epoch - 26ms/step
Epoch 32/50
190/190 - 5s - loss: 1.2075 - accuracy: 0.7267 - val_loss: 2.2039 - val_accuracy: 0.4431 - 5s/epoch - 25ms/step
Epoch 33/50
190/190 - 5s - loss: 1.1752 - accuracy: 0.7333 - val_loss: 2.1934 - val_accuracy: 0.4569 - 5s/epoch - 26ms/step
Epoch 34/50
190/190 - 5s - loss: 1.1438 - accuracy: 0.7432 - val_loss: 2.1843 - val_accuracy: 0.4485 - 5s/epoch - 26ms/step
Epoch 35/50
190/190 - 5s - loss: 1.1126 - accuracy: 0.7513 - val_loss: 2.1746 - val_accuracy: 0.4535 - 5s/epoch - 26ms/step
Epoch 36/50
190/190 - 5s - loss: 1.0835 - accuracy: 0.7614 - val_loss: 2.1784 - val_accuracy: 0.4545 - 5s/epoch - 26ms/step
plot(history)
Evaluate the model on the test set.
model %>%
  evaluate(x_test, y_test)
64/64 - 1s - loss: 2.2635 - accuracy: 0.4342 - 1s/epoch - 19ms/step
loss accuracy
2.2634687 0.4341584
Generate predictions on new data.
predictions <- model %>%
  predict(x_test) %>%
  k_argmax()
64/64 - 2s - 2s/epoch - 24ms/step
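k_argmax() returns 0-based class indices, so we can map them back to the factor levels and compare with the true labels; a minimal sketch (pred_labels is a hypothetical name, and this block is not run here):

# convert the tensor of 0-based indices back to food category names
pred_labels <- levels(test_data$label)[as.integer(as.array(predictions)) + 1]
# should match the test accuracy reported by evaluate() above
mean(pred_labels == as.character(test_data$label))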