%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%
%%% Reconstructing TS (time in storage) pdf based on pdfs of TU (time in 
%%% use) and TH (time at home) specified as frequencies over discrete bins
%%%
%%% Dmitry Yumashev, 15/11/2019
%%% Dmitry Yumashev, 19/11/2019
%%% Dmitry Yumashev, 20/11/2019
%%% Dmitry Yumashev, 21/11/2019
%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

clear all %#ok<*CLALL>

%%% general settings

step_fitting = 0.5; % required new timestep after re-binning

scaling_factor_samples = 1e+6; % this needs to be large to improve the accuracy of fitting

%%% options for the fmincon solver (nonlinear programming)

options = optimoptions(@fmincon,'Algorithm','interior-point','OptimalityTolerance',1e-10,'StepTolerance',1e-12,'MaxFunctionEvaluations',1e+4,'MaxIterations',1e+4,'PlotFcn','optimplotx','UseParallel',1);


%%% input and outpiut files

folder = 'time';

filename_output = [folder '/all_cdfs.xlsx'];

filename_tu = [folder '/TU_freq2.csv']; % this is age of used devices measured from the year of purchase!
filename_tuu = [folder '/TUU_freq2.csv']; % this is age of unused devices measured from the year of purchase!

 
%%% read the data sample

[nums_tu, ~, raw_tu] = xlsread(filename_tu);  
[nums_tuu, ~, raw_tuu] = xlsread(filename_tuu); 

%%%

all_unu_keys = nums_tu(:,3);
unu_keys = unique(all_unu_keys);
total_unu_keys = length(unu_keys);

%
all_product_descriptions = raw_tu(2:end,4); % NOTE: this includes the label in the first row, so we need to offset it!


%%% manually enter the time steps

time_steps = transpose([0 1 3 6 10 15 20 40]); % define as a column 
total_time_steps = length(time_steps);

% 

bin_widths = time_steps(2:end) - time_steps(1:end-1); 

%

all_freq_tu = nums_tu(:,6);
all_freq_tuu = nums_tuu(:,6);

%

time_steps_interp = transpose(time_steps(1):step_fitting:time_steps(end)); % define as a column
total_time_steps_interp = length(time_steps_interp);

%

bin_widths_interp = time_steps_interp(2:end) - time_steps_interp(1:end-1); 

%

time_steps_continuous_plotting = transpose(time_steps(1):0.1:time_steps(end)); % define as a column

%%%

through_count_products = 0;

for k = 1:total_unu_keys
    
    current_unu_key = unu_keys(k);
        
    current_rows = find(all_unu_keys == current_unu_key);
    
    %
    
%     if k==2
%        
%         disp('debugging...')
%         
%     end
    
    product_descriptions = unique(all_product_descriptions(current_rows));
    
    for p=1:length(product_descriptions) % some UNU keys have more than one product
    
        current_product_description = product_descriptions(p); 
        
        % update the selection of rows to include the selected product
        % within the given unu key (subset of the initial selection of
        % rows)
        
        current_rows = find((all_unu_keys == current_unu_key) .* (strcmp(all_product_descriptions, current_product_description) == 1));

        %%%

        freq_tu = all_freq_tu(current_rows);

        freq_tuu = all_freq_tuu(current_rows);


        %%% reconstructing cdfs

        cdf_tu = zeros(total_time_steps,1);
        cdf_tuu = zeros(total_time_steps,1);

        cdf_tu(1) = 0;
        cdf_tuu(1) = 0;

        for i=2:total_time_steps

            delta = time_steps(i) - time_steps(i-1);

            cdf_tu(i) = cdf_tu(i-1) + freq_tu(i-1) * delta;
            cdf_tuu(i) = cdf_tuu(i-1) + freq_tuu(i-1) * delta;          
            
        end

        % normalising to 1 and reconstructing pdfs in the process

        scale_tu = cdf_tu(end);
        %
        cdf_tu = cdf_tu / scale_tu;
        pdf_tu = freq_tu / scale_tu; 

        scale_tuu = cdf_tuu(end);
        %
        cdf_tuu = cdf_tuu / scale_tuu;
        pdf_tuu = freq_tuu / scale_tuu; 

        
        %%% now interpolate the cdfs using the specified step for the
        %%% fitting
                       
        cdf_tu_interp = interp1(time_steps, cdf_tu, time_steps_interp, 'linear'); 
        cdf_tuu_interp = interp1(time_steps, cdf_tuu, time_steps_interp, 'linear'); 
        
        pdf_tu_interp = zeros(total_time_steps_interp-1,1);
        pdf_tuu_interp = zeros(total_time_steps_interp-1,1);

        for i=1:total_time_steps_interp-1 % NOTE the final index
        
           delta = time_steps_interp(i+1) - time_steps_interp(i);
           
           pdf_tu_interp(i) = (cdf_tu_interp(i+1) - cdf_tu_interp(i)) / delta;
           pdf_tuu_interp(i) = (cdf_tuu_interp(i+1) - cdf_tuu_interp(i)) / delta;
            
        end
        
        
        %%%

    %     figure
    %     
    %     x = time_steps(1:end-1);
    %     
    %     hold on
    %     bar(x,pdf_tuu) 
    %     bar(x,pdf_tu)
    %     hold off


        %%% now use nonlinear programming optimisation to minimise the misfit between
        %%% cdf(conv(tu, ts)) and cdf(tu+ts) (we know pdfs and cdfs for tu and tu+ts)

    %     par_tuu = [a_tuu,b_tuu];    
    %     par_tu = [a_tu,b_tu];

        % use function handle to define the function of x to be minimised

    
        %     fun_err = @(x) misfit(time_steps_fitting,par_tuu,par_tu,x); % NOTE: use time_steps (coarser steps) instead of time_steps_continuous_plotting
        
        fun = @(x) misfit(time_steps_interp,pdf_tuu_interp,pdf_tu_interp,x,0); % x is the unknown pdf_ts, with the components corresponding to the values in each bin; the 0 in the end indicates no allowance for the items stored early      
        %
        fun_extended = @(x) misfit_extended(time_steps_interp,pdf_tuu_interp,pdf_tu_interp,x); % x is the unknown pdf_ts AND the share of the items stored early, with the components corresponding to the values in each bin    

        % define initial condition for the optimisation
  
        x0 = 0.5 * (pdf_tuu_interp + pdf_tu_interp); % NOTE: this is a column; 
        %
        x0_extended = [0.5 * (pdf_tuu_interp + pdf_tu_interp); 0]; % NOTE: this is a column; initial fraction (w) of products stored immediately is set to 0

        % NOTE: these are columns
        lb = zeros(total_time_steps_interp-1,1); % pdf entries cannot be negative; the fraction (w) of products stored immediately cannot be negative
        ub = (1 / min(bin_widths_interp)) * ones(total_time_steps_interp-1,1); % higest pdf occurs when all the probability is in the smallest bin; the fraction (w) of the products stored immediately is limited to 10% (assume 90% or more are used first!) 
        %
        lb_extended = [zeros(total_time_steps_interp-1,1); 0]; % pdf entries cannot be negative; the fraction (w) of products stored immediately cannot be negative
        ub_extended = [(1 / min(bin_widths_interp)) * ones(total_time_steps_interp-1,1); 0.1]; % higest pdf occurs when all the probability is in the smallest bin; the fraction (w) of the products stored immediately is limited to 10% (assume 90% or more are used first!) 

        % normalising constraint: sum(bin_width .* pdf) = 1, which is
        % equivalent to Aeq * x = b_eq in the matrix algebra of the solver

        % NOTE: we need to convert bin widths column into row for the matrix algebra to work!!
        Aeq = transpose(bin_widths_interp); % bin widths; 
        %
        Aeq_extended = [transpose(bin_widths_interp) 0]; % bin widths; additional 0 in the end excludes the weight (w) from the normalising condition 
        
        beq = 1; % integral of the pdf over the bins which should be equal to 1
  
        
    %     % matrix multiplication checks: integrals of pdf_tu and pdf_tuu
    %     whcih should be equal to 1 and are evaluated using Aeq
    %     
    %     Aeq*pdf_tu % NOTE: use "*" for matrix multiplication
    %     Aeq*pdf_tuu

    %     options = optimset('PlotFcns',@optimplotfval);
    %     x_opt = fminsearch(fun_err,x0,options);
    
        fprintf('\n')
        text = ['running fmincon for UNU=' num2str(current_unu_key) ' "' char(current_product_description) '"...\n'];
        fprintf(text) 
        
        fprintf('case 1: exclude items stored without being used first...\n')
        
        [x_opt, fmin] = fmincon(fun,x0,[],[],Aeq,beq,lb,ub,[],options); % NOTE the excluded fields (must be replaced with [] for the solver to work)
        
        %
        
        fprintf('case 2: include items stored without being used first...\n')
        
        [x_opt_extended, fmin_extended] = fmincon(fun_extended,x0_extended,[],[],Aeq_extended,beq,lb_extended,ub_extended,[],options); % NOTE the excluded fields (must be replaced with [] for the solver to work)
       
        fprintf('done!\n')
        
        %
        
        fprintf('min misfit no early storage = %f, min misfit with early storage = %f\n', fmin, fmin_extended)
        
        % optimal solution for x is the required pdf_ts
        
        if fmin <= fmin_extended % the case without early storage gives a better fit

            pdf_ts_interp = x_opt; 
            frac_early_storage = 0; 

        else % the case with early storage gives a better fit
            
            pdf_ts_interp = x_opt_extended(1:end-1); % all but last parameter in x_opt
            frac_early_storage = x_opt_extended(end); % last parameter in x_opt

        end
        
        
        % reconstruct the relevant cdf

        cdf_ts_interp = zeros(total_time_steps_interp,1);

        cdf_ts_interp(1) = 0; % initial value (just in case)

        for i=2:total_time_steps_interp % NOTE the starting value

            delta = time_steps_interp(i) - time_steps_interp(i-1);

            cdf_ts_interp(i) = cdf_ts_interp(i-1) + pdf_ts_interp(i-1) * delta;

        end

        %%% reconstructing optimal solution for the convolved pdf and cdf

        [~, cdf_conv_interp] = pdf_convolution(time_steps_interp,pdf_tu_interp,pdf_ts_interp);


        % make sure the dimensions are in agreement
        if size(cdf_ts_interp,1) == size(cdf_conv_interp,2) && size(cdf_ts_interp,2) == size(cdf_conv_interp,1)

            cdf_conv_interp = transpose(cdf_conv_interp);

        end

        cdf_tuurec_interp = frac_early_storage * cdf_ts_interp + (1-frac_early_storage) * cdf_conv_interp;


        %%% now create large random samples corresponding to the three pdfs and 
        %%% fit analytical distributions using the fitdist function

        %%% this requires generating large pools of numbers distributed according to the pdfs

        updated_freq_tu = floor(scaling_factor_samples * pdf_tu_interp); % rounding down is necessary to generate integers; the error is small if the scaling factor is large
        updated_freq_ts = floor(scaling_factor_samples * pdf_ts_interp); % rounding down is necessary to generate integers; the error is small if the scaling factor is large
        updated_freq_tuu = floor(scaling_factor_samples * pdf_tuu_interp); % rounding down is necessary to generate integers; the error is small if the scaling factor is large

        for i=1:total_time_steps_interp-1 % NOTE the final index

           total_samples_tuuis_bin_tu = updated_freq_tu(i);
           total_samples_tuuis_bin_ts = updated_freq_ts(i);
           total_samples_tuuis_bin_tuu = updated_freq_tuu(i);

           uniform_rand_vector_tu = rand(total_samples_tuuis_bin_tu,1);
           uniform_rand_vector_ts = rand(total_samples_tuuis_bin_ts,1);
           uniform_rand_vector_tuu = rand(total_samples_tuuis_bin_tuu,1);

           % scale the uniform distribution with samples number proportinal to the pdf value over this bin to the width of the current bin
           delta = (time_steps_interp(i+1) - time_steps_interp(i));
           
           samples_tuuis_bin_tu = time_steps_interp(i) + delta * uniform_rand_vector_tu;
           samples_tuuis_bin_ts = time_steps_interp(i) + delta * uniform_rand_vector_ts;
           samples_tuuis_bin_tuu = time_steps_interp(i) + delta * uniform_rand_vector_tuu;

           if i==1

               all_samples_tu = samples_tuuis_bin_tu;
               all_samples_ts = samples_tuuis_bin_ts;
               all_samples_tuu = samples_tuuis_bin_tuu;

           else

               all_samples_tu = [all_samples_tu; samples_tuuis_bin_tu]; %#ok<*AGROW>
               all_samples_ts = [all_samples_ts; samples_tuuis_bin_ts];
               all_samples_tuu = [all_samples_tuu; samples_tuuis_bin_tuu];

           end

        end


    %     if k==3
    %        
    %         disp('debugging...')
    %         
    %     end


        %%% fitting Weibull distribution for tu

        dist = 'weibull';

        pd = fitdist(all_samples_tu, dist);
        a_tu = pd.a;
        b_tu = pd.b;

        %%% fitting weibull distribution for tu+ts

        pd = fitdist(all_samples_tuu, dist);
        a_tuu = pd.a;
        b_tuu = pd.b;


        %%% recording the results into arrays
        
        through_count_products = through_count_products + 1;
        
        results_early_storage_cell(through_count_products,1) = num2cell(current_unu_key); %#ok<*SAGROW>
        results_early_storage_cell(through_count_products,2) = current_product_description;
        results_early_storage_cell(through_count_products,3) = num2cell(frac_early_storage);
        
        % 
        
        aux_cell = cell(total_time_steps_interp,1);
        
        aux_cell(:,1) = num2cell(current_unu_key * ones(total_time_steps_interp,1));
        aux_cell(:,2) = current_product_description;
        %
        aux_cell(:,3) = num2cell(time_steps_interp);
        %
        aux_cell(:,4) = num2cell(cdf_tu_interp); % "used" = "at home" - "unused" data (both measured from purchase) 
        aux_cell(:,5) = num2cell(cdf_tuu_interp); % "unused" data (measured from purchase)
        %
        aux_cell(:,6) = num2cell(cdf_tuurec_interp); % "unused" reconstructed (measured from purchase)
        aux_cell(:,7) = num2cell(cdf_conv_interp); % this is tu+ts, reconstructed (measured from purchase)
        %
        aux_cell(:,8) = num2cell(cdf_ts_interp); % finally, this is ts, reconstructed (measured from end of use)
        %

            
        if through_count_products == 1
                  
            results_cdf_cell = aux_cell;  
        
        else
        
            results_cdf_cell = [results_cdf_cell; aux_cell];
            
        end
        

        %%% plotting

    %     weib_pdf_tuu = wblpdf(time_steps_continuous_plotting,a_tuu,b_tuu);
    %     weib_pdf_tu = wblpdf(time_steps_continuous_plotting,a_tu,b_tu);
    %     weib_pdf_ts = wblpdf(time_steps_continuous_plotting,a_ts,b_ts);
    %     
    %     weib_cdf_tuu = wblcdf(time_steps_continuous_plotting,a_tuu,b_tuu);
    %     weib_cdf_tu = wblcdf(time_steps_continuous_plotting,a_tu,b_tu);
    %     weib_cdf_ts = wblcdf(time_steps_continuous_plotting,a_ts,b_ts);




        %%% pdf plots

        fig_pdf = figure;
        set(fig_pdf, 'Visible', 'off');

        
        hold on

        % NOTE: scale the number of bins according to the actual data range
        % to ensure the bin width is the same for all the datasest
        default_range = time_steps_interp(end) - time_steps_interp(1);
        
        nbins = floor((total_time_steps_interp-1)*(max(all_samples_tuu) - min(all_samples_tuu))/default_range);
        %
        if nbins == 0 
            nbins = 1;
        end
        %    
        l_tuu = histogram(all_samples_tuu,nbins,'FaceColor',[1 0 0]); % red 
        
        
        nbins = floor((total_time_steps_interp-1)*(max(all_samples_tu) - min(all_samples_tu))/default_range);
        %
        if nbins == 0 
            nbins = 1;
        end
        %
        l_tu = histogram(all_samples_tu,nbins,'FaceColor',[0 0 1]); % blue
        
        
        nbins = floor((total_time_steps_interp-1)*(max(all_samples_ts) - min(all_samples_ts))/default_range);
        %
        if nbins == 0 
            nbins = 1;
        end
        %
        l_ts = histogram(all_samples_ts,nbins,'FaceColor',[0.5 0.5 0.5]); % grey

    %     l_tuu = plot(time_steps_continuous_plotting,weib_pdf_tuu,'k','Linewidth',1.5);
    %     l_tu = plot(time_steps_continuous_plotting,weib_pdf_tu,'b');
    %     l_ts = plot(time_steps_continuous_plotting,weib_pdf_ts,'m');

        hold off
        
        xlabel('time (yrs)')
        ylabel('frequency (~pdf)')

        legend([l_tuu l_tu l_ts],{'age unused (from purchase)','age used (from purchase)','time stored (from end of use)'},'Location','northeast')

        text_title = ['PDFs, UNU=' num2str(current_unu_key) ' "' char(current_product_description) '"']; 

        title(text_title)

        %

        text_figure = [folder '/PDFs_UNU_' num2str(current_unu_key) '_product_' num2str(p)];

        saveas(fig_pdf, [text_figure '.png'], 'png')



        %%% cumulative plots to check

        fig_cdf = figure('Position', get(0, 'Screensize'));
        set(gca,'FontSize',14)
        set(gcf,'color','w')
        set(fig_cdf, 'Visible', 'off');

        xmin = time_steps(1);
        xmax = time_steps(end);

        ymin = 0;
        ymax = 1;

        subplot(1,2,1)

        hold on

        l1 = plot(time_steps_interp,cdf_tuu_interp,'r');
        l2 = plot(time_steps_interp,cdf_tu_interp,'b');
        l3 = plot(time_steps_interp,cdf_ts_interp,'k');
    %     l22 = plot(time_steps_continuous_plotting,weib_cdf_tu,'c');   
    %     l11 = plot(time_steps_continuous_plotting,weib_cdf_tuu,'m');

        hold off

        set(gca,'xlim',[xmin xmax])
        set(gca,'ylim',[ymin ymax])
        
        xlabel('time (yrs)')
        ylabel('cdf')

    %     legend([l1 l11 l2 l21],{'TU','TU weibull','TU+TS','TU+TS weibull'},'Location','northwest')
        legend([l1 l2 l3],{'age unused (from purchase)','age used (from purchase)','time in storage (from end of use)'},'Location','southeast')

        text_subplot = ['CDFs, UNU=' num2str(current_unu_key) ' "' char(current_product_description) '"']; 

        title(text_subplot)

        %
        
        subplot(1,2,2)

        hold on

        l1 = plot(time_steps_interp,cdf_tuu_interp,'r');
        l2 = plot(time_steps_interp,cdf_tuurec_interp,'c'); % this is reconstructed t_home
%         l3 = plot(time_steps_interp,cdf_conv_interp,'k'); % this is reconstructed tu+ts

        hold off

        set(gca,'xlim',[xmin xmax])
        set(gca,'ylim',[ymin ymax])
        
        xlabel('time (yrs)')
        ylabel('cdf')

        legend([l1 l2],{'age unused data','age unused reconstr'},'Location','northwest')

        text_subplot = ['CDFs: data vs reconstr, UNU=' num2str(current_unu_key) ' "' char(current_product_description) '"']; 

        title(text_subplot)

        %

        text_figure = [folder '/CDFs_UNU_' num2str(current_unu_key) '_product_' num2str(p)];

        saveas(fig_cdf, [text_figure '.png'], 'png')
    
        %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        
        %%% closing the loop over products within a given unu key
        
    end
    
    %%% closing the loop over unu keys
    
end


%%% writing the results in files

labels_early_storage = {'UNU', 'Product', 'frac_early_storage'};

matrix_early_storage = [labels_early_storage; results_early_storage_cell];

sheet = 'early_storage';
xlswrite(filename_output,matrix_early_storage,sheet)

%

labels_cdf = {'UNU', 'Product', 'time points', 'cdf_tu (age used, data)', 'cdf_tuu (age unused, data)',...
    'cdf_tuurec (age unused, reconstr)', 'cdf_conv (tu+ts, reconstr)', 'cdf_ts (time stored, reconstr)'};

matrix_cdf = [labels_cdf; results_cdf_cell];

sheet = 'cdfs';
xlswrite(filename_output,matrix_cdf,sheet)