The fastai library simplifies training fast and accurate neural nets using modern best practices. See the fastai website to get started. The library is based on research into deep learning best practices undertaken at fast.ai, and includes “out of the box” support for vision, text, tabular, and collab (collaborative filtering) models.
To be able to give examples of optimizer steps, we will need some steppers, like the following:
library(magrittr)
library(fastai)
tst_param = function(val, grad = NULL) {
  "Create a tensor with `val` and a gradient of `grad` for testing"
  res = tensor(val) %>% float()
  if (is.null(grad)) {
    grad = tensor(val / 10)
  } else {
    grad = tensor(grad)
  }
  res$grad = grad %>% float()
  res
}
p = tst_param(1., 0.1)
p
tensor(1.)
sgd_step(p, 1.)
p
tensor(0.9000)
p$grad
tensor(0.1000)
The `step` method will loop over all param groups, then over all parameters for which `grad` is not `NULL`, and call each function in `stepper`, passing it the parameter `p` with the hyper-parameters in the corresponding dict in `hypers`.
params = L(lapply(0:3, function(x) tst_param(x)))
opt = Optimizer(params, sgd_step, lr=0.1)
opt$step()
str(params$items)
List of 4
$ :tensor(0.)
$ :tensor(0.9900)
$ :tensor(1.9800)
$ :tensor(2.9700)
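As a quick sanity check on these numbers (a base-R sketch, not using fastai): `tst_param(x)` gives each parameter a gradient of `x / 10`, so one SGD step with `lr = 0.1` maps `x` to `x - 0.1 * x/10 = 0.99 * x`.
vals = 0:3
grads = vals / 10              # tst_param's default gradient
lr = 0.1
vals - lr * grads              # 0.00 0.99 1.98 2.97, matching the output above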
params = L(lapply(0:3, function(x) tst_param(x)))
opt = Optimizer(params, list(weight_decay, sgd_step), lr=0.1, wd = 0.1)
opt$step()
str(params$items)
List of 4
$ :tensor(0.)
$ :tensor(0.9800)
$ :tensor(1.9600)
$ :tensor(2.9400)
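Here `weight_decay` runs before `sgd_step`: each parameter `x` is first scaled by `1 - lr*wd` and then takes the gradient step. The same arithmetic in base R (a sketch, independent of the library):
vals = 0:3
grads = vals / 10
lr = 0.1; wd = 0.1
vals * (1 - lr * wd) - lr * grads   # 0.00 0.98 1.96 2.94, as printed above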
params = L(lapply(0:3, function(x) tst_param(x)))
opt = Optimizer(params, sgd_step, lr=0.1)
# remove the gradient of the last parameter: step() skips params whose grad is NULL
try(params[3]$grad <- NULL, TRUE)
params[3]$grad
opt$step()
str(params$items)
List of 4
$ :tensor(0.)
$ :tensor(0.9900)
$ :tensor(1.9800)
$ :tensor(3.)
params = L(lapply(0:3, function(x) tst_param(x)))
opt = Optimizer(list(params[0:1],params[2:3]), sgd_step, lr=0.1)
# lower the learning rate of the first param group only (the second group keeps lr = 0.1)
opt$hypers$items[[1]][[1]] = 0.01
opt$step()
str(params$items)
List of 4
$ :tensor(0.)
$ :tensor(0.9990)
$ :tensor(1.9800)
$ :tensor(2.9700)
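The first param group (values 0 and 1) now steps with `lr = 0.01`, while the second group (values 2 and 3) keeps `lr = 0.1`. A base-R sketch of the expected result:
vals = 0:3
grads = vals / 10
lrs = c(0.01, 0.01, 0.1, 0.1)   # per-group learning rates
vals - lrs * grads              # 0.000 0.999 1.980 2.970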
params = L(lapply(0:3, function(x) tst_param(x)))
opt = Optimizer(params, list(weight_decay, sgd_step), lr=0.1, wd = 0.1)
opt$zero_grad()
str(params$items)
List of 4
$ :tensor(0.)
$ :tensor(1.)
$ :tensor(2.)
$ :tensor(3.)
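`zero_grad` clears the gradients but leaves the weights untouched, which is why the values above are unchanged. To confirm the gradients themselves were zeroed, one can inspect a parameter directly; the expected output below is an assumption based on fastai zeroing grads in place rather than removing them:
params$items[[2]]$grad
# expected: tensor(0.)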
Keeps track of the avg grads of `p` in `state` with `mom`.

`dampening = FALSE` gives the classical formula for momentum in SGD:

new_val = old_val * mom + grad

whereas `dampening = TRUE` makes it an exponential moving average:

new_val = old_val * mom + grad * (1 - mom)
p = tst_param(c(1,2,3), c(4,5,6))
state = average_grad(p, mom = 0.9, dampening = FALSE, grad_avg = NULL)
# expected running average after one un-dampened update: the gradient itself
p$grad
# tensor([4., 5., 6.])
state = average_grad(p, mom = 0.9, dampening = TRUE)
# expected average after one dampened update: (1 - mom) * grad
p$grad*0.1
# tensor([0.4000, 0.5000, 0.6000])
# and after a second dampened update: (0.1*0.9 + 0.1) * grad
p$grad*(0.1*0.9+0.1)
# tensor([0.7600, 0.9500, 1.1400])
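The factors 0.1 and 0.19 above fall out of the dampened recurrence `new = old * mom + (1 - mom) * grad`. A small base-R sketch of that recurrence (independent of fastai), applied twice to a constant gradient:
mom = 0.9
grad = c(4, 5, 6)
avg = 0
for (i in 1:2) {
  avg = avg * mom + (1 - mom) * grad   # dampened moving average
  print(avg)
}
# step 1: 0.40 0.50 0.60   (= 0.1  * grad)
# step 2: 0.76 0.95 1.14   (= 0.19 * grad)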
Keeps track of the squared avg grads of `p` in `state` with `sqr_mom`.

`dampening = FALSE` gives the classical formula for momentum in SGD:

new_val = old_val * sqr_mom + grad ** 2

whereas `dampening = TRUE` makes it an exponential moving average:

new_val = old_val * sqr_mom + grad ** 2 * (1 - sqr_mom)
p = tst_param(c(1,2,3), c(4,5,6))
state = average_sqr_grad(p, sqr_mom = 0.99, dampening = FALSE)
# expected running value after one un-dampened update: grad^2
p$grad$pow(2)
# tensor([16., 25., 36.])
# and after a second un-dampened update: 1.99 * grad^2
p$grad$pow(2) * 1.99
# tensor([31.8400, 49.7500, 71.6400])
state = average_sqr_grad(p, sqr_mom = 0.99)
# with the default dampening = TRUE, one update gives 0.01 * grad^2
p$grad$pow(2) * 1e-2
# tensor([0.1600, 0.2500, 0.3600])
state = average_sqr_grad(p, sqr_mom = 0.99)
# and a second dampened update gives (0.01*0.99 + 0.01) * grad^2
p$grad$pow(2)*(0.01*0.99+0.01)
# tensor([0.3184, 0.4975, 0.7164])
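The same recurrence applies to the squared gradients, with `sqr_mom` in place of `mom`; the dampened version produces the factors 0.01 and 0.0199 used above. A base-R sketch:
sqr_mom = 0.99
grad = c(4, 5, 6)
sqr_avg = 0
for (i in 1:2) {
  sqr_avg = sqr_avg * sqr_mom + (1 - sqr_mom) * grad^2   # dampened average of grad^2
  print(sqr_avg)
}
# step 1: 0.1600 0.2500 0.3600   (= 0.01   * grad^2)
# step 2: 0.3184 0.4975 0.7164   (= 0.0199 * grad^2)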
params = L(lapply(0:3, function(x) tst_param(x)))
opt = Optimizer(params, sgd_step, lr = 0.1)
opt$freeze_to(1L)
An `Optimizer` for SGD with `lr`, `mom` and `params`.

Optional weight decay of `wd` is applied, as true weight decay (decay the weights directly) if `decouple_wd = TRUE`, else as L2 regularization (add the decay to the gradients).
params = L(lapply(0:3, function(x) tst_param(x)))
opt = SGD(params, lr = 0.1)
opt$step()
str(params$items)
List of 4
$ :tensor(0.)
$ :tensor(0.9900)
$ :tensor(1.9800)
$ :tensor(2.9700)
params = L(lapply(0:3, function(x) tst_param(x)))
opt = SGD(params, lr = 0.1, mom = 0.9)
opt$step()
str(params$items)
List of 4
$ :tensor(0.)
$ :tensor(0.9900)
$ :tensor(1.9800)
$ :tensor(2.9700)
Test weight decay; notice that L2 regularization is different from weight decay even for simple SGD with momentum.
params = L(lapply(0:3, function(x) tst_param(x)))
#Weight decay
opt = SGD(params, lr=0.1, mom=0.9, wd=0.1)
opt$step()
str(params$items)
List of 4
$ :tensor(0.)
$ :tensor(0.9800)
$ :tensor(1.9600)
$ :tensor(2.9400)
params = L(lapply(0:3, function(x) tst_param(x)))
#L2 reg
opt = SGD(params, lr=0.1, mom=0.9, wd=0.1, decouple_wd=FALSE)
opt$step()
str(params$items)
List of 4
$ :tensor(0.)
$ :tensor(0.9800)
$ :tensor(1.9600)
$ :tensor(2.9400)
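In this particular test both variants land on the same values after one step, because the gradients are proportional to the weights (`grad = x/10`) and there is no momentum history yet; the difference appears on later steps, since decoupled weight decay never enters the averaged gradients. The first-step arithmetic in base R (a sketch, not the library code):
vals = 0:3
grads = vals / 10
lr = 0.1; wd = 0.1
# decoupled weight decay: shrink the weights, then take the gradient step
vals * (1 - lr * wd) - lr * grads     # 0.00 0.98 1.96 2.94
# L2 regularization: add wd * weight to the gradient before the step
vals - lr * (grads + wd * vals)       # 0.00 0.98 1.96 2.94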
An `Optimizer` for RMSProp with `lr`, `sqr_mom`, `mom` and `params`.

RMSProp was introduced by Geoffrey Hinton in his course. What is named `sqr_mom` here is the `alpha` in the course. Optional weight decay of `wd` is applied, as true weight decay (decay the weights directly) if `decouple_wd = TRUE`, else as L2 regularization (add the decay to the gradients).
params = tst_param(c(1:3), c(0.1,0.2,0.3))
opt = RMSProp(params, lr=0.1)
opt$step()
opt$step()
step = (-0.1 * 0.1) / (sqrt((0.01*0.99+0.01) * 0.1**2) + 1e-8)
params; tensor(c(step, 1+step, 2+step))
tensor([-0.7089, 0.2911, 1.2911])
tensor([-0.7089, 0.2911, 1.2911])
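The hand-computed `step` above follows the standard RMSProp recurrence: keep an exponential moving average of the squared gradients, then scale the gradient by its inverse square root. A base-R sketch of two such steps (without momentum; the dampened average and `eps` placement are assumptions that match the formula used above):
lr = 0.1; sqr_mom = 0.99; eps = 1e-8
p = c(1, 2, 3); grad = c(0.1, 0.2, 0.3)
sqr_avg = 0
for (i in 1:2) {
  sqr_avg = sqr_avg * sqr_mom + (1 - sqr_mom) * grad^2   # moving average of grad^2
  p = p - lr * grad / (sqrt(sqr_avg) + eps)
}
p   # -0.7089  0.2911  1.2911, matching the example above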
params = tst_param(c(1:3), c(0.1,0.2,0.3))
opt = RMSProp(params, lr=0.1, mom=0.9)
opt$step()
opt$step()
step = (- 0.1 * (0.1 + 0.9*0.1)) / (sqrt((0.01*0.99+0.01) * 0.1**2) + 1e-8)
params; tensor(c(step, 1+step, 2+step))
tensor([-1.3469, -0.3469, 0.6531])
tensor([-1.3469, -0.3469, 0.6531])
An `Optimizer` for Adam with `lr`, `mom`, `sqr_mom`, `eps` and `params`.

Adam was introduced by Diederik P. Kingma and Jimmy Ba in Adam: A Method for Stochastic Optimization. For consistency across optimizers, we renamed `beta1` and `beta2` in the paper to `mom` and `sqr_mom`. Note that our defaults also differ from the paper (0.99 for `sqr_mom` or `beta2`, 1e-5 for `eps`). Those values seem to be better from our experiments in a wide range of situations.

Optional weight decay of `wd` is applied, as true weight decay (decay the weights directly) if `decouple_wd = TRUE`, else as L2 regularization (add the decay to the gradients).
params = tst_param(c(1:3), c(0.1,0.2,0.3))
opt = Adam(params, lr=0.1, wd=0)
opt$step()
step = (-0.1 * 0.1) / (sqrt(0.1**2) + 1e-8)
params; tensor(c(1+step, 2+step, 3+step))
tensor([0.9000, 1.9000, 2.9000])
tensor([0.9000, 1.9000, 2.9000])
opt$step()
params; tensor(c(1+2*step, 2+2*step, 3+2*step))
tensor([0.8000, 1.8000, 2.8000])
tensor([0.8000, 1.8000, 2.8000])
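Because the gradient is constant in this test, Adam's debiased averages equal the raw gradient and its square, so the step reduces to `-lr * grad / (sqrt(grad^2) + eps)`, i.e. roughly `-lr` per step regardless of the gradient's size. A base-R sketch of that simplification:
lr = 0.1; eps = 1e-8
grad = c(0.1, 0.2, 0.3)
step = -lr * grad / (sqrt(grad^2) + eps)   # ~ -0.1 for every element
c(1, 2, 3) + step       # 0.9 1.9 2.9  (one step)
c(1, 2, 3) + 2 * step   # 0.8 1.8 2.8  (two steps)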
# RAdam-style rectification term: with beta = sqr_mom = 0.99, compute the
# approximated length of the moving average (r) at each step and the resulting
# variance-rectification factor v, then plot how it evolves over training.
beta = 0.99
r_inf = 2/(1 - beta) - 1
rs = lapply(5:500, function(s) {r_inf - 2*s*beta**s/(1 - beta**s)}) %>% as.numeric()
v = sqrt(((rs - 4) * (rs - 2) * r_inf)/((r_inf - 4)*(r_inf - 2)*rs))
df_high = data.frame(x = 1:length(v), y = v)
library(highcharter)
hchart(df_high, 'line', hcaes(x, y))
An `Optimizer` for QHAdam with `lr`, `mom`, `sqr_mom`, `nus`, `eps` and `params`.
params = tst_param(c(1:3), c(0.1,0.2,0.3))
opt = QHAdam(params, lr=0.1)
opt$step()
step = (-0.1 * (((1-0.7) * 0.1) + (0.7 * 0.1)) )/ (
sqrt(((1-1.0) * 0.1**2) + (1.0 * 0.1**2)) + 1e-8)
params; tensor(c(1+step, 2+step, 3+step))
# tensor([0.9000, 1.9000, 2.9000])
# tensor([0.9000, 1.9000, 2.9000])
opt$step()
params; tensor(c(1+2*step, 2+2*step, 3+2*step))
# tensor([0.8000, 1.8000, 2.8000])
# tensor([0.8000, 1.8000, 2.8000])
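With the default `nus = c(0.7, 1.0)`, QHAdam blends the raw gradient with its debiased average in the numerator and does the same with the squared terms in the denominator; with a constant gradient both blends collapse to the gradient itself, which is what the hand computation above exploits. A base-R sketch of the blended step (the `nus` values are taken from the formula above):
lr = 0.1; eps = 1e-8
nu1 = 0.7; nu2 = 1.0
grad = c(0.1, 0.2, 0.3)
# with a constant gradient the debiased averages equal grad and grad^2
num = (1 - nu1) * grad + nu1 * grad
den = sqrt((1 - nu2) * grad^2 + nu2 * grad^2) + eps
c(1, 2, 3) - lr * num / den   # 0.9 1.9 2.9, matching the first opt$step()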
An `Optimizer` for LARC/LARS with `lr`, `mom`, `eps` and `params`.

The LARS optimizer was first introduced in Large Batch Training of Convolutional Networks then refined in its LARC variant (original LARS is with `clip = FALSE`). A learning rate is computed for each individual layer with a certain `trust_coefficient`, then clipped to be always less than `lr`.

Optional weight decay of `wd` is applied, as true weight decay (decay the weights directly) if `decouple_wd = TRUE`, else as L2 regularization (add the decay to the gradients).
params = list(tst_param(c(1:3), c(0.1,0.2,0.3)), tst_param(c(1:3), c(0.01,0.02,0.03)))
opt = Larc(params, lr=0.1)
opt$step()
#First param local lr is 0.02 < lr so it's not clipped
opt$state[params[[1]]]['local_lr']
$local_lr
tensor(0.0200)
#Second param local lr is 0.2 > lr so it's clipped to lr
opt$state[params[[2]]]['local_lr']
$local_lr
[1] 0.1
params = list(tst_param(c(1:3), c(0.1,0.2,0.3)), tst_param(c(1:3), c(0.01,0.02,0.03)))
opt = Larc(params, lr=0.1, clip = FALSE)
opt$step()
#With clip = FALSE, the second param keeps its local lr of 0.2 (no clipping)
opt$state[params[[1]]]['local_lr']
$local_lr
tensor(0.0200)
opt$state[params[[2]]]['local_lr']
$local_lr
tensor(0.2000)
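The `local_lr` values come from LARC's trust-ratio rule: the layer-wise rate scales `lr` by `trust_coefficient * ||p|| / ||grad||` (weight-decay and `eps` terms in the denominator are negligible here), and is clipped at `lr` when `clip = TRUE`. A base-R sketch of that computation for the two test parameters (the trust coefficient of 0.02 is assumed from the values printed above):
lr = 0.1; trust_coeff = 0.02; eps = 1e-8
p = c(1, 2, 3)
g1 = c(0.1, 0.2, 0.3); g2 = c(0.01, 0.02, 0.03)
local_lr = function(grad) lr * trust_coeff * sqrt(sum(p^2)) / (sqrt(sum(grad^2)) + eps)
local_lr(g1)   # 0.02 -> below lr, never clipped
local_lr(g2)   # 0.20 -> clipped to lr = 0.1 when clip = TRUE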
An `Optimizer` for LAMB with `lr`, `mom`, `sqr_mom`, `eps` and `params`.

LAMB was introduced in Large Batch Optimization for Deep Learning: Training BERT in 76 minutes. Intuitively, it's LARC applied to Adam. As in Adam, we renamed `beta1` and `beta2` in the paper to `mom` and `sqr_mom`. Note that our defaults also differ from the paper (0.99 for `sqr_mom` or `beta2`, 1e-5 for `eps`). Those values seem to be better from our experiments in a wide range of situations.

Optional weight decay of `wd` is applied, as true weight decay (decay the weights directly) if `decouple_wd = TRUE`, else as L2 regularization (add the decay to the gradients).
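As with the other optimizers, a single LAMB step on the standard test parameter produces the output shown below (assuming the constructor is exposed as `Lamb()`, mirroring the other wrappers used in this article):
params = tst_param(c(1:3), c(0.1,0.2,0.3))
opt = Lamb(params, lr = 0.1)
opt$step()
params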
tensor([0.7840, 1.7840, 2.7840])
params = tst_param(c(1:3), c(0.1,0.2,0.3))
p = params$data$clone()
g = tensor(c(0.1,0.2,0.3))
opt = Lookahead(SGD(params, lr=0.1))
for (i in 1:5) {
  opt$step()
}
#first 5 steps are normal SGD steps
params; p - g * 0.5
# tensor([0.9500, 1.9000, 2.8500])
# tensor([0.9500, 1.9000, 2.8500])
#Since k=6, sixth step is a moving average of the 6 SGD steps with the initial weight
opt$step()
params; p * 0.5 + (p-g*0.6) * 0.5
# tensor([0.9700, 1.9400, 2.9100])
# tensor([0.9700, 1.9400, 2.9100])
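The Lookahead numbers can be reproduced with plain arithmetic: the fast weights take `k = 6` SGD steps of `-lr * grad`, after which the slow weights move halfway towards them (the 0.5 factor in the hand computation above is Lookahead's default `alpha`). A base-R sketch:
lr = 0.1; k = 6; alpha = 0.5
p0 = c(1, 2, 3); g = c(0.1, 0.2, 0.3)
fast = p0 - k * lr * g         # fast weights after 6 SGD steps
p0 + alpha * (fast - p0)       # slow (reported) weights: 0.97 1.94 2.91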