laia-create-model

#!/usr/bin/env th

require 'laia'
assert(cudnn ~= nil, 'cuDNN is required by create_model')

-- Command-line parser of the tool. The description is a single string built
-- by concatenation, so every piece that is followed by another sentence must
-- end with a space (otherwise sentences get glued together in the output).
local parser = laia.argparse(){
  name = 'create_model',
  description = 'Create a model for HTR composed by a set of convolution ' ..
    'blocks, followed by a set of bidirectional LSTM or GRU layers, and a ' ..
    'final linear layer. Each convolution block is composed by a ' ..
    '2D convolution layer, an optional batch normalization layer, ' ..
    'a non-linear activation function and an optional 2D max-pooling layer. ' ..
    'Also, each block, rnn layer and the final linear layer may be preceded ' ..
    'by a dropout layer.'
}

-- Helpers shared by the list-valued '<size>' options declared below.

-- Renders a list of values as a comma-separated string.
local function sizeToString(dims)
  return table.concat(table.map(dims, tostring), ',')
end

-- Validator: asks table.all whether every dimension satisfies d >= 0.
local function allNonNegative(dims)
  return table.all(dims, function(d) return d >= 0 end)
end

-- Validator: asks table.all whether every dimension satisfies d > 0.
local function allPositive(dims)
  return table.all(dims, function(d) return d > 0 end)
end

-- Options of the convolutional blocks. Each of them accepts one or more
-- values (args '+'), i.e. one value per convolutional layer.

parser:option('--cnn_batch_norm',
              'Batch normalization before the activation in each conv layer.',
              {false},
              laia.toboolean)
  :argname('<bool>')   -- Placeholder shown for the option value
  :args('+')           -- Option with >= 1 arguments

parser:option('--cnn_dropout',
              'Dropout probability at the input of each conv layer, 0 <= p < 1.',
              {0},
              tonumber)
  :argname('<p>')
  :args('+')
  :ge(0.0):lt(1.0)     -- Dropout must be in the range [0, 1)

parser:option('--cnn_spatial_dropout',
              'Use spatial dropout at the input of each conv layer, instead of the ' ..
              'regular dropout.',
              {false},
              laia.toboolean)
  :argname('<bool>')
  :args('+')

parser:option('--cnn_num_features',
              'Number of feature maps in each conv layer, n > 0',
              {16, 16, 32, 32},
              laia.toint)
  :argname('<n>')
  :args('+')
  :gt(0)               -- Number of features must be > 0

parser:option('--cnn_maxpool_size',
              'MaxPooling size after each conv layer. Separate ' ..
              'each dimension with commas (order: width,height).',
              {{2,2}, {2,2}, {0}, {2,2}},
              laia.tolistint)
  :argname('<size>')
  :args('+')
  :assert(allNonNegative)
  :tostring(sizeToString)

parser:option('--cnn_kernel_size',
              'Kernel size of each conv layer. Separate each ' ..
              'dimension with commas (order: width,height).',
              {{3,3}, {3,3}, {3,3}, {3,3}},
              laia.tolistint)
  :argname('<size>')
  :args('+')
  :assert(allPositive)
  :tostring(sizeToString)

parser:option('--cnn_type',
              'Type of the activation function in each conv layer, valid types are ' ..
              'relu, tanh, prelu, rrelu, leakyrelu, softplus.',
              {'relu'},
              {
                relu = 'relu',
                tanh = 'tanh',
                prelu = 'prelu',
                rrelu = 'rrelu',
                leakyrelu = 'leakyrelu',
                softplus = 'softplus'
              })
  :argname('<type>')
  :args('+')

-- Options of the recurrent part and of the final linear layer. Unlike the
-- --cnn_* options, each of these takes a single value (the same dropout,
-- number of units and type are used when the recurrent stack is created).

parser:option(
  '--rnn_dropout',
  'Dropout probability at the input of each recurrent layer, 0 <= p < 1.',
  0.5, tonumber)
  :argname('<p>')
  :ge(0.0):lt(1.0)  -- Dropout must be in the range [0, 1)

parser:option(
  '--rnn_num_layers',
  'Number of recurrent layers, n > 0.', 3, laia.toint)
  :argname('<n>')
  :gt(0)

parser:option(
  '--rnn_num_units',
  'Number of units in the recurrent layers, n > 0.', 256, laia.toint)
  :argname('<n>')
  :gt(0)

-- The second table restricts the accepted values to the listed types.
parser:option(
  '--rnn_type',
  'Type of the recurrent layers, valid types are blstm, bgru.',
  'blstm', {blstm = 'blstm', bgru = 'bgru'})
  :argname('<type>')

parser:option(
  '--linear_dropout',
  'Dropout probability at the input of the final linear layer, 0 <= p < 1.',
  0.5, tonumber)
  :argname('<p>')
  :ge(0.0):lt(1.0)  -- Dropout must be in the range [0, 1)

-- Seed later passed to laia.manualSeed before the model is created.
parser:option(
  '--seed -s', 'Seed for random numbers generation.',
  0x012345, laia.toint)
-- Positional arguments. The first three are strictly positive integers and
-- are registered from a table; the order of the entries is the order in
-- which the arguments are declared to the parser.
local positive_int_arguments = {
  {'input_channels', 'Number of channels of the input images.'},
  {'input_height', 'Height of the input images.'},
  {'output_size',
   'Number of output symbols. If you are going to use the CTC ' ..
   'loss include one additional element!'},
}
for _, spec in ipairs(positive_int_arguments) do
  local arg_name, arg_help = spec[1], spec[2]
  parser:argument(arg_name, arg_help)
    :convert(laia.toint)
    :gt(0)
end
-- The last positional argument is the path of the file to write.
parser:argument('output_file', 'Output file to store the model')

-- Options contributed by other laia components: version and logging.
laia.Version():registerOptions(parser)
laia.log.registerOptions(parser)

-- Parse the command line; `opt` holds the value of every option/argument.
local opt = parser:parse()

-- One convolutional block is created for each value given to
-- --cnn_num_features, so that list fixes the number of conv layers.
local cnn_layers = #opt.cnn_num_features

-- Every other per-layer option must provide cnn_layers values. Shorter lists
-- are extended by repeating their last element until they have that length.
local per_layer_options = {
  'cnn_kernel_size', 'cnn_maxpool_size', 'cnn_batch_norm',
  'cnn_type', 'cnn_dropout', 'cnn_spatial_dropout',
}
for _, option_name in ipairs(per_layer_options) do
  local values = opt[option_name]
  table.append_last(values, cnn_layers - #values)
end

-- Kernel and max-pooling sizes must be pairs of integers: a size given with
-- a single dimension gets its last value repeated to complete the pair.
local function toPair(size)
  return table.append_last(size, 2 - #size)
end
opt.cnn_kernel_size = table.map(opt.cnn_kernel_size, toPair)
opt.cnn_maxpool_size = table.map(opt.cnn_maxpool_size, toPair)

-- Seed the random number generators before any layer is created.
laia.manualSeed(opt.seed)

-- Builds one convolutional block and returns it as an nn.Sequential:
--   [dropout] -> 2D convolution -> [batch norm] -> activation -> [max-pool]
-- Parameters:
--   depth_in, depth_out  Input & output channels/filters of the convolution.
--   kernel_w, kernel_h   Size of the convolution kernels.
--   pool_w, pool_h       Size of the pooling windows; pooling is only added
--                        when both are > 0.
--   activation           One of 'relu', 'tanh', 'leakyrelu', 'softplus',
--                        'prelu', 'rrelu' (default: 'relu').
--   batch_norm           Add batch normalization after the convolution
--                        (default: false).
--   dropout              Dropout probability at the input of the block; no
--                        dropout layer is added when it is 0 (default: 0).
--   spatial_dropout      Use nn.SpatialDropout instead of nn.Dropout
--                        (default: false).
-- Fails with an assertion error if the activation name is not known.
local function convBlock(
    depth_in, depth_out,
    kernel_w, kernel_h,
    pool_w, pool_h,
    activation, batch_norm, dropout, spatial_dropout)
  -- Fill in the defaults of the optional parameters.
  if not activation then activation = 'relu' end
  if not batch_norm then batch_norm = false end
  if not dropout then dropout = 0 end
  if not spatial_dropout then spatial_dropout = false end

  local layers = nn.Sequential()

  -- Optional dropout applied to the input of the block.
  if dropout > 0 then
    local dropout_layer
    if spatial_dropout then
      dropout_layer = nn.SpatialDropout(dropout)
    else
      dropout_layer = nn.Dropout(dropout)
    end
    layers:add(dropout_layer)
  end

  -- 2D convolution with stride 1. The zero padding on each side is half of
  -- the kernel size minus one, rounded up.
  local pad_w = math.ceil((kernel_w - 1) / 2)
  local pad_h = math.ceil((kernel_h - 1) / 2)
  layers:add(nn.SpatialConvolution(depth_in, depth_out,
                                   kernel_w, kernel_h,
                                   1, 1,
                                   pad_w, pad_h))

  -- Optional batch normalization, placed before the activation.
  if batch_norm then
    layers:add(nn.SpatialBatchNormalization(depth_out))
  end

  -- Activation function: each entry creates the layer for one valid name.
  local activation_builders = {
    relu      = function() return nn.ReLU(true) end,
    tanh      = function() return nn.Tanh() end,
    leakyrelu = function() return nn.LeakyReLU(true) end,
    softplus  = function() return nn.SoftPlus() end,
    prelu     = function() return nn.PReLU() end,
    rrelu     = function() return nn.RReLU(1.0 / 8.0, 1.0 / 3.0, true) end,
  }
  local build_activation = activation_builders[activation]
  if not build_activation then
    assert(false, string.format('Unknown activation function %s', activation))
  end
  layers:add(build_activation())

  -- Optional max pooling with non-overlapping windows (stride == size).
  local use_pooling = pool_w > 0 and pool_h > 0
  if use_pooling then
    layers:add(nn.SpatialMaxPooling(pool_w, pool_h, pool_w, pool_h))
  end

  return layers
end

-- Returns the size of one image dimension after pooling it with windows of
-- `pool_size` elements and a stride equal to the window size.
--   input_size  Size of the dimension before pooling.
--   pool_size   Size of the pooling window; values below 2 leave the size
--               unchanged.
local function computeSizeAfterPooling(input_size, pool_size)
  -- Windows smaller than 2 do not shrink the input.
  if pool_size < 2 then return input_size end
  -- Number of complete windows that fit in the input.
  local num_windows = (input_size - pool_size) / pool_size + 1
  return math.floor(num_windows)
end

local model = nn.Sequential()
-- Height and depth of the feature maps after the layers added so far. They
-- are needed to compute the size of the input of the recurrent layers.
local curr_h = opt.input_height
local curr_c = opt.input_channels
-- Append convolutional layer blocks
for i = 1, cnn_layers do
  local kernel_w, kernel_h = opt.cnn_kernel_size[i][1], opt.cnn_kernel_size[i][2]
  local pool_w, pool_h = opt.cnn_maxpool_size[i][1], opt.cnn_maxpool_size[i][2]
  model:add(convBlock(curr_c, opt.cnn_num_features[i],
                      kernel_w, kernel_h,
                      pool_w, pool_h,
                      opt.cnn_type[i], opt.cnn_batch_norm[i],
                      opt.cnn_dropout[i], opt.cnn_spatial_dropout[i]))
  -- Height after the convolution (stride 1, padding ceil((k - 1) / 2) on
  -- each side, as used by convBlock). Odd kernels keep the height, even
  -- kernels increase it by one.
  local pad_h = math.ceil((kernel_h - 1) / 2)
  curr_h = curr_h + 2 * pad_h - kernel_h + 1
  -- convBlock only adds the pooling layer when both pooling dimensions are
  -- positive, so the height must only be reduced in that case.
  if pool_w > 0 and pool_h > 0 then
    curr_h = computeSizeAfterPooling(curr_h, pool_h)
  end
  curr_c = opt.cnn_num_features[i]
end
-- A non-positive height means the input images are too small for the
-- requested kernels/pooling; fail with a clear message before creating the
-- recurrent layers with an invalid input size.
assert(curr_h > 0, string.format(
         'The height of the feature maps after the convolutional layers ' ..
         'is %d, use taller input images or less pooling', curr_h))
-- Size of each element of the sequence fed to the recurrent layers: one
-- column of the feature maps, with all its channels.
local rnn_input_dim = curr_c * curr_h
-- Convert images to 1D sequences by processing columns of the image as the
-- sequence elements.
model:add(laia.nn.ImageColumnSequence())
-- Append recurrent layers
if opt.rnn_type == 'blstm' then
  model:add(cudnn.BLSTM(rnn_input_dim, opt.rnn_num_units, opt.rnn_num_layers,
                        false, opt.rnn_dropout))
else
  model:add(cudnn.BGRU(rnn_input_dim, opt.rnn_num_units, opt.rnn_num_layers,
                       false, opt.rnn_dropout))
end
-- Linear projection of each timestep and batch sample (LxNxD -> (LN)xD).
-- The recurrent layers are bidirectional, hence the 2 * rnn_num_units.
model:add(nn.Reshape(-1, opt.rnn_num_units * 2, false))
if opt.linear_dropout > 0 then
  model:add(nn.Dropout(opt.linear_dropout))
end
model:add(nn.Linear(opt.rnn_num_units * 2, opt.output_size))
model:float()

-- Write the new model to disk inside a checkpoint. The parsed options are
-- stored as the model configuration, and the same model is registered in
-- both the 'Best' and the 'Last' entries of the checkpoint.
local checkpoint = laia.Checkpoint()
checkpoint:setModelConfig(opt)
for _, entry in ipairs({'Best', 'Last'}) do
  checkpoint[entry](checkpoint):setModel(model)
end
checkpoint:save(opt.output_file)

-- Log the architecture and the total number of parameters of the model.
local flat_params = model:getParameters()
laia.log.info('\n' .. model:__tostring__())
laia.log.info('Saved model with %d parameters to %q',
              flat_params:nElement(), opt.output_file)