Commit

Merge branch 'develop'
doug919 committed Feb 13, 2015
2 parents 695a735 + 00149f5 commit 9f5e9a2
Showing 64 changed files with 1,942 additions and 131 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -4,3 +4,10 @@ dev/*
*models*
tmp
data
MKLv2/exp_1/log/log*
MKLv2/exp_2/log/log*
MKLv2/exp_3/log/log*
MKLv2/exp_4/log/log*
MKLv2/drawing/output/*.png
MKLv2/drawing/output/*.csv

61 changes: 61 additions & 0 deletions MKLv2/README
@@ -0,0 +1,61 @@


I. Experiment Design

There is a slide in the doc folder that illustrates the flow of these experiments.

Experiment 1: 1-feature-vector
features = {'TFIDF', 'keyword', 'image_rgba_gist', 'image_rgba_phog'}
Gamma = [0.1 1 10 15 20]
variables = 1 all, 9 random

Experiment 2: 2-feature-vector
features = {'TFIDF+keyword', 'image_rgba_gist+image_rgba_phog'}
Gamma = [0.1 1 10 15 20]
variables = 1 all, 9 random

Experiment 4: 4-feature-vector (20 kernels)
features = {'TFIDF+keyword+image_rgba_gist+image_rgba_phog'}
Gamma = [0.1 1 10 15 20]
variables = all for each feature


II. Run Experiments

1. Go to an experiment folder, e.g. exp_1.
2. Edit the batch file, e.g. single_exe.m or parallel_exe.py.
3. Execute the batch file.
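
For example, a single run might look like this from the MATLAB prompt (a sketch; names taken from the steps above):

    cd exp_1            % 1. enter the experiment folder
    edit single_exe.m   % 2. adjust parameters in the batch file
    single_exe          % 3. execute the batch file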


III. Run an Experiment with the Same Seed (single execution only)

The seed makes an experiment reproducible, so we can easily debug or
re-run it on the same datasets.

Steps for generating a seed:
1. Run common/batchGenSeed.m in MATLAB.
2. Copy fixed_exp_seed.mat to the destination experiment folder,
like 'exp_1' (do not rename it; the experiment recognizes only this file name).
3. Execute the experiment.
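
A minimal sketch of the whole workflow, assuming MATLAB is started in the MKLv2 folder:

    run('common/batchGenSeed.m');              % 1. writes fixed_exp_seed.mat to the current folder
    copyfile('fixed_exp_seed.mat', 'exp_1/');  % 2. keep the file name unchanged
    cd exp_1                                   % 3. run the single-execution batch file as usual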

IV. Plotting

(1) Bar Chart

1. Use common/mklv2_test_result_to_csv.m to convert the .mat output to .csv. See drawing/batchExp1EvalToCsv.m for an example.

Note:
You can modify the input file name in drawing/batchExp1EvalToCsv.m to suit your needs.

2. Use drawing/batchPlotExp1FromCsv.py to plot the bar chart.

Note 1:
I use matplotlib from the Enthought Canopy Python package on Win7 to plot the chart.
Note 2:
You should modify the input .csv file name in drawing/batchPlotExp1FromCsv.py to suit your needs.

3. Exp2, Exp3, and Exp4 follow the same approach; a sketch of the whole pipeline follows.
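
Putting steps 1 and 2 together, the Exp1 pipeline can be driven like this (a sketch; it assumes a Python with matplotlib is on the system path):

    run('drawing/batchExp1EvalToCsv.m');               % .mat results -> .csv
    system('python drawing/batchPlotExp1FromCsv.py');  % .csv -> bar chart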




9 changes: 9 additions & 0 deletions MKLv2/common/batchGenSeed.m
@@ -0,0 +1,9 @@


output_file = 'fixed_exp_seed.mat';
n_data = 1600;      % number of training samples
npos = n_data / 2;  % half positive
nneg = npos;        % half negative

mklv2_make_seed(output_file, npos, nneg);

47 changes: 47 additions & 0 deletions MKLv2/common/mklv2_CreateKernelListWithVariable.m
@@ -0,0 +1,47 @@
function [kernelcellaux, kerneloptioncellaux, variablecellaux] = mklv2_CreateKernelListWithVariable(variablecell, dim, kernelcell, kerneloptioncell, feature_start_idx)


j=1;
for i=1:length(variablecell)
optnum = str2num(variablecell{i}); % numeric string -> scalar; otherwise []
if isscalar(optnum)
varopt = 'number';
else
varopt = variablecell{i};
end

switch varopt
case 'all'
kernelcellaux{j}=kernelcell{i};
kerneloptioncellaux{j}=kerneloptioncell{i};
variablecellaux{j}=1:dim;
j=j+1;
case 'single'
for k=1:dim
kernelcellaux{j}=kernelcell{i};
kerneloptioncellaux{j}=kerneloptioncell{i};
variablecellaux{j}=k;
j=j+1;
end;
case 'random'
kernelcellaux{j}=kernelcell{i};
kerneloptioncellaux{j}=kerneloptioncell{i};
indicerand=randperm(dim);
nbvarrand=floor(rand*dim)+1;
variablecellaux{j}=indicerand(1:nbvarrand);
j=j+1;
case 'number'
kernelcellaux{j}=kernelcell{i};
kerneloptioncellaux{j}=kerneloptioncell{i};
n_feature_type = size(feature_start_idx, 2);
if n_feature_type > optnum % middle
variablecellaux{j}=feature_start_idx(optnum):feature_start_idx(optnum+1)-1;
elseif n_feature_type == optnum % last
variablecellaux{j}=feature_start_idx(optnum):dim;
else
error('incorrect feature idx');
end
j=j+1;
end;
end;
variablecellaux % left unsuppressed on purpose: echo the selected variable lists
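
A hypothetical call, assuming two feature blocks starting at columns 1 and 11 of a 20-dimensional fused matrix (kernel names follow the SimpleMKL convention):

    variablecell = {'all' '1' '2'};                   % one kernel on all dims, one per feature block
    kernelcell = {'gaussian' 'gaussian' 'gaussian'};
    kerneloptioncell = {1 1 1};                       % e.g. Gaussian bandwidths
    [kc, ko, vc] = mklv2_CreateKernelListWithVariable(variablecell, 20, kernelcell, kerneloptioncell, [1 11]);
    % vc{2} is 1:10 and vc{3} is 11:20, i.e. the column ranges of the two blocks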
15 changes: 15 additions & 0 deletions MKLv2/common/mklv2_build_kernel.m
@@ -0,0 +1,15 @@
function [K_train, weight, info_kernel, Xnorm_train, Xnorm_test] = mklv2_build_kernel(kernel_param, dim, X_train, X_test, options, feature_start_idx)

if nargin < 6
feature_start_idx = [];
end

disp('Building training kernel...')
[weight, info_kernel, Xnorm_train, Xnorm_test] = ...
mklv2_preprocessing(kernel_param, dim, X_train, X_test, feature_start_idx);
K_train = mklkernel(Xnorm_train, info_kernel, weight, options);

fprintf('weight is %d x %d\n', size(weight, 1), size(weight, 2));
fprintf('info_kernel is %d x %d\n', size(info_kernel, 1), size(info_kernel, 2));
fprintf('K_train is %d x %d x %d\n\n', size(K_train, 1), size(K_train, 2), size(K_train, 3));
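
Downstream, K_train is the multi-kernel array that SimpleMKL's solver consumes; a sketch of that call, with output names assumed:

    % [beta, alpha, b, pos] = mklsvm(K_train, y_train, svm_C, options, verbose);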

62 changes: 62 additions & 0 deletions MKLv2/common/mklv2_config.m
@@ -0,0 +1,62 @@
% Common configuration and solver options for the mklsvm classification experiments
%
%


%PROJECT_ROOT = '/home/doug919/projects/github_repo/LJ40K/MKLv2';
DATA_ROOT = '/home/doug919/projects/data/MKLv2';
addpath('/tools/SimpleMKL');
addpath('/tools/SVM-KM');

options.algo = 'svmclass'; % Choice of algorithm in mklsvm can be either
% 'svmclass' or 'svmreg'
%------------------------------------------------------
% choosing the stopping criterion
%------------------------------------------------------
options.stopvariation = 0; % use variation of weights for stopping criterion
options.stopKKT = 0; % set to 1 if you use KKTcondition for stopping criterion
options.stopdualitygap = 1; % set to 1 for using duality gap for stopping criterion

%------------------------------------------------------
% choosing the stopping criterion value
%------------------------------------------------------
options.seuildiffsigma = 1e-2; % stopping criterion for weight variation
options.seuildiffconstraint = 0.1; % stopping criterion for KKT
options.seuildualitygap = 0.01; % stopping criterion for duality gap

%------------------------------------------------------
% Setting some numerical parameters
%------------------------------------------------------
options.goldensearch_deltmax = 1e-1; % initial precision of golden section search
options.numericalprecision = 1e-8; % numerical precision weights below this value
% are set to zero
options.lambdareg = 1e-8; % ridge added to kernel matrix

%------------------------------------------------------
% some algorithm parameters
%------------------------------------------------------
options.firstbasevariable = 'first'; % tie breaking method for choosing the base
% variable in the reduced gradient method
options.nbitermax = 500; % maximal number of iteration
options.seuil = 0; % forcing to zero weights lower than this
options.seuilitermax = 10; % value, for iterations lower than this one

options.miniter = 0; % minimal number of iterations
options.verbosesvm = 0; % verbosity of inner svm algorithm

%
% Note: setting this to 1 raises a `strrep` error in vectorize.dll,
% which cannot be fixed because the .h library files are missing.
% Modified by MaxisKao @ Sep. 4 2014
options.efficientkernel = 0; % efficient storage of kernels disabled (see note above)

%------------------------------------------------------------------------
% Misc
%------------------------------------------------------------------------
classcode = [1 -1];
% get emotion vector located at data root folder
emotions = util_read_csv(fullfile(DATA_ROOT, 'emotion.csv'));

rng('shuffle');
8 changes: 8 additions & 0 deletions MKLv2/common/mklv2_get_best_param.m
@@ -0,0 +1,8 @@
function [best_param_C] = mklv2_get_best_param(result)

% find the best performance index
max_bc = max([result.bc{:}]);
max_idx = find([result.bc{:}] == max_bc);

% if there are multiple maxima, take the last (largest) C, since a larger C usually performs better
best_param_C = result.svm_C{max_idx(end)};
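
A usage sketch, assuming result.bc (performance scores) and result.svm_C are parallel cell arrays collected during cross-validation:

    % best_C = mklv2_get_best_param(result);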
15 changes: 15 additions & 0 deletions MKLv2/common/mklv2_get_binary_vector.m
@@ -0,0 +1,15 @@
function [y_processed] = mklv2_get_binary_vector(y, emotion)

for i=1:length(y)
cmplen = min(length(y(i, :)), length(emotion));

if strncmp(y(i, :), emotion, cmplen)
y_processed(i) = 1;
else
y_processed(i) = -1;
end

end

y_processed = transpose(y_processed);
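
For example (a sketch; 'happy' stands in for any emotion label from emotion.csv):

    % y_bin = mklv2_get_binary_vector(y, 'happy');  % +1 where a row of y matches 'happy', -1 elsewhere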

16 changes: 16 additions & 0 deletions MKLv2/common/mklv2_kfold.m
@@ -0,0 +1,16 @@
function [X_train, y_train, X_dev, y_dev] = mklv2_kfold(X, y, group_indices, group_id)

% call crossvalind('Kfold', y, 10) to generate group_indices,
% then use this function to split the train/dev sets by group_id

dev_bin_idx = (group_indices==group_id);
train_bin_idx = ~dev_bin_idx;

X_dev = X(dev_bin_idx, :);
y_dev = y(dev_bin_idx, :);

X_train = X(train_bin_idx, :);
y_train = y(train_bin_idx, :);
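
A usage sketch following the comment above (crossvalind comes from the Bioinformatics Toolbox):

    group_indices = crossvalind('Kfold', y, 10);                             % 10 folds
    [X_train, y_train, X_dev, y_dev] = mklv2_kfold(X, y, group_indices, 1);  % fold 1 as dev set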



6 changes: 6 additions & 0 deletions MKLv2/common/mklv2_load_data.m
@@ -0,0 +1,6 @@
function [X_return, y_return] = mklv2_load_data(file_path)

disp(sprintf('==> load from %s', file_path));
load(file_path);
X_return = X;
y_return = y;
27 changes: 27 additions & 0 deletions MKLv2/common/mklv2_load_multiple_features.m
@@ -0,0 +1,27 @@
function [X_fused, y_fused, start_idx] = mklv2_load_multiple_features(cells_sample_path)

%{
cells_sample_path: cell array of path strings; e.g. {'/home/doug919/a.mat' '/home/doug919/b.mat'}
%}

X_fused = [];
y_fused = [];
start_idx = [];
next_start_idx = 1;

for i=1:length(cells_sample_path)
disp(sprintf('==> loading data from %s', cells_sample_path{i}));
load(cells_sample_path{i});
size_of_X = size(X) % unsuppressed on purpose: show each feature block's dimensions

if isempty(y_fused)
y_fused = y;
elseif ~isequal(y_fused, y)
error('y vectors do not match across feature files');
end

X_fused = cat(2, X_fused, X);
start_idx = cat(2, start_idx, next_start_idx);
next_start_idx = size(X_fused, 2)+1;
end
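
A hypothetical call fusing two feature files that share the same y (paths are placeholders):

    % [X_fused, y_fused, start_idx] = mklv2_load_multiple_features({'TFIDF.mat' 'keyword.mat'});
    % start_idx(k) is the first column of feature block k inside X_fused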

11 changes: 11 additions & 0 deletions MKLv2/common/mklv2_load_seed.m
@@ -0,0 +1,11 @@


seed_file_name = 'fixed_exp_seed.mat';

if exist(seed_file_name, 'file')
disp('using existing seed file');
load(seed_file_name); % loads variable `seed`
else
seed = [];
end

6 changes: 6 additions & 0 deletions MKLv2/common/mklv2_make_seed.m
@@ -0,0 +1,6 @@
function [] = mklv2_make_seed(output_file, npos, nneg)

seed.positive = randperm(npos);
seed.negative = randperm(nneg);

save(output_file, 'seed')
12 changes: 12 additions & 0 deletions MKLv2/common/mklv2_preprocessing.m
@@ -0,0 +1,12 @@
function [weight, info_kernel, X_normalized_train, X_normalized_dev] = mklv2_preprocessing(kernel_param, dim, X_train, X_dev, feature_start_idx)

if nargin < 5
feature_start_idx = [];
end

[kernel_type_vec, kernel_option_vec, kernel_var_vec_cell] = ...
mklv2_CreateKernelListWithVariable(kernel_param.variable_vec, dim, kernel_param.type_vec, kernel_param.option_vec, feature_start_idx);

[X_normalized_train, X_normalized_dev] = normalizemeanstd(X_train, X_dev);
[weight, info_kernel] = ...
UnitTraceNormalization(X_normalized_train, kernel_type_vec, kernel_option_vec, kernel_var_vec_cell);
62 changes: 62 additions & 0 deletions MKLv2/common/mklv2_separate_samples.m
@@ -0,0 +1,62 @@
function [X_train, y_train, X_test, y_test, aux] = mklv2_separate_samples(X, y, n_train, aux, classcode)

%{
Function:
mklv2_separate_samples
Description:
separate samples into training and testing sets
Input:
X : data
y : labels
n_train : must have two elements: the number of [positive, negative] samples in the training set
aux : structure of index permutations; if not specified, this function generates a new one
.positive : permutation of positive-sample indexes
.negative : permutation of negative-sample indexes
classcode : label representations; default value is [1 -1]
Output:
X_train : data of training samples
y_train : labels of training samples
X_test : data of testing samples
y_test : labels of testing samples
aux : the permutation structure actually used (returned so it can be reused)
%}

if nargin<5
classcode(1) = 1;
classcode(2) = -1;
end;

if length(n_train)~=2
error('parameter n_train must have two elements.');
else
n_train_pos = n_train(1);
n_train_neg = n_train(2);
idx_pos = find(y==classcode(1));
idx_neg = find(y==classcode(2));
n_pos = length(idx_pos);
n_neg = length(idx_neg);

if nargin < 4 || isempty(aux)
aux_pos = randperm(n_pos);
aux.positive = aux_pos;
aux_neg = randperm(n_neg);
aux.negative = aux_neg;
elseif length(aux.positive)~=n_pos || length(aux.negative)~=n_neg
error('unmatched number of aux samples');
else
aux_pos = aux.positive;
aux_neg = aux.negative;
end
idx_train = [idx_pos(aux_pos(1:n_train_pos)); idx_neg(aux_neg(1:n_train_neg))];
idx_test = [idx_pos(aux_pos(n_train_pos+1:end)) ; idx_neg(aux_neg(n_train_neg+1:end))];
X_train = X(idx_train,:);
y_train = y(idx_train);
X_test = X(idx_test,:);
y_test = y(idx_test,:);
end
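
A usage sketch consistent with batchGenSeed.m above (800 positive and 800 negative training samples, reusing a saved seed):

    % [X_train, y_train, X_test, y_test, aux] = mklv2_separate_samples(X, y, [800 800], seed, [1 -1]);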

