%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% %%% Reconstructing TS (time in storage) pdf based on pdfs of TU (time in %%% use) and TH (time at home) specified as frequencies over discrete bins %%% %%% Dmitry Yumashev, 15/11/2019 %%% Dmitry Yumashev, 19/11/2019 %%% Dmitry Yumashev, 20/11/2019 %%% Dmitry Yumashev, 21/11/2019 %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% clear all %#ok<*CLALL> %%% general settings step_fitting = 0.5; % required new timestep after re-binning scaling_factor_samples = 1e+6; % this needs to be large to improve the accuracy of fitting %%% options for the fmincon solver (nonlinear programming) options = optimoptions(@fmincon,'Algorithm','interior-point','OptimalityTolerance',1e-10,'StepTolerance',1e-12,'MaxFunctionEvaluations',1e+4,'MaxIterations',1e+4,'PlotFcn','optimplotx','UseParallel',1); %%% input and outpiut files folder = 'time'; filename_output = [folder '/all_cdfs.xlsx']; filename_tu = [folder '/TU_freq2.csv']; % this is age of used devices measured from the year of purchase! filename_tuu = [folder '/TUU_freq2.csv']; % this is age of unused devices measured from the year of purchase! %%% read the data sample [nums_tu, ~, raw_tu] = xlsread(filename_tu); [nums_tuu, ~, raw_tuu] = xlsread(filename_tuu); %%% all_unu_keys = nums_tu(:,3); unu_keys = unique(all_unu_keys); total_unu_keys = length(unu_keys); % all_product_descriptions = raw_tu(2:end,4); % NOTE: this includes the label in the first row, so we need to offset it! %%% manually enter the time steps time_steps = transpose([0 1 3 6 10 15 20 40]); % define as a column total_time_steps = length(time_steps); % bin_widths = time_steps(2:end) - time_steps(1:end-1); % all_freq_tu = nums_tu(:,6); all_freq_tuu = nums_tuu(:,6); % time_steps_interp = transpose(time_steps(1):step_fitting:time_steps(end)); % define as a column total_time_steps_interp = length(time_steps_interp); % bin_widths_interp = time_steps_interp(2:end) - time_steps_interp(1:end-1); % time_steps_continuous_plotting = transpose(time_steps(1):0.1:time_steps(end)); % define as a column %%% through_count_products = 0; for k = 1:total_unu_keys current_unu_key = unu_keys(k); current_rows = find(all_unu_keys == current_unu_key); % % if k==2 % % disp('debugging...') % % end product_descriptions = unique(all_product_descriptions(current_rows)); for p=1:length(product_descriptions) % some UNU keys have more than one product current_product_description = product_descriptions(p); % update the selection of rows to include the selected product % within the given unu key (subset of the initial selection of % rows) current_rows = find((all_unu_keys == current_unu_key) .* (strcmp(all_product_descriptions, current_product_description) == 1)); %%% freq_tu = all_freq_tu(current_rows); freq_tuu = all_freq_tuu(current_rows); %%% reconstructing cdfs cdf_tu = zeros(total_time_steps,1); cdf_tuu = zeros(total_time_steps,1); cdf_tu(1) = 0; cdf_tuu(1) = 0; for i=2:total_time_steps delta = time_steps(i) - time_steps(i-1); cdf_tu(i) = cdf_tu(i-1) + freq_tu(i-1) * delta; cdf_tuu(i) = cdf_tuu(i-1) + freq_tuu(i-1) * delta; end % normalising to 1 and reconstructing pdfs in the process scale_tu = cdf_tu(end); % cdf_tu = cdf_tu / scale_tu; pdf_tu = freq_tu / scale_tu; scale_tuu = cdf_tuu(end); % cdf_tuu = cdf_tuu / scale_tuu; pdf_tuu = freq_tuu / scale_tuu; %%% now interpolate the cdfs using the specified step for the %%% fitting cdf_tu_interp = interp1(time_steps, cdf_tu, time_steps_interp, 'linear'); cdf_tuu_interp = interp1(time_steps, cdf_tuu, time_steps_interp, 'linear'); pdf_tu_interp = zeros(total_time_steps_interp-1,1); pdf_tuu_interp = zeros(total_time_steps_interp-1,1); for i=1:total_time_steps_interp-1 % NOTE the final index delta = time_steps_interp(i+1) - time_steps_interp(i); pdf_tu_interp(i) = (cdf_tu_interp(i+1) - cdf_tu_interp(i)) / delta; pdf_tuu_interp(i) = (cdf_tuu_interp(i+1) - cdf_tuu_interp(i)) / delta; end %%% % figure % % x = time_steps(1:end-1); % % hold on % bar(x,pdf_tuu) % bar(x,pdf_tu) % hold off %%% now use nonlinear programming optimisation to minimise the misfit between %%% cdf(conv(tu, ts)) and cdf(tu+ts) (we know pdfs and cdfs for tu and tu+ts) % par_tuu = [a_tuu,b_tuu]; % par_tu = [a_tu,b_tu]; % use function handle to define the function of x to be minimised % fun_err = @(x) misfit(time_steps_fitting,par_tuu,par_tu,x); % NOTE: use time_steps (coarser steps) instead of time_steps_continuous_plotting fun = @(x) misfit(time_steps_interp,pdf_tuu_interp,pdf_tu_interp,x,0); % x is the unknown pdf_ts, with the components corresponding to the values in each bin; the 0 in the end indicates no allowance for the items stored early % fun_extended = @(x) misfit_extended(time_steps_interp,pdf_tuu_interp,pdf_tu_interp,x); % x is the unknown pdf_ts AND the share of the items stored early, with the components corresponding to the values in each bin % define initial condition for the optimisation x0 = 0.5 * (pdf_tuu_interp + pdf_tu_interp); % NOTE: this is a column; % x0_extended = [0.5 * (pdf_tuu_interp + pdf_tu_interp); 0]; % NOTE: this is a column; initial fraction (w) of products stored immediately is set to 0 % NOTE: these are columns lb = zeros(total_time_steps_interp-1,1); % pdf entries cannot be negative; the fraction (w) of products stored immediately cannot be negative ub = (1 / min(bin_widths_interp)) * ones(total_time_steps_interp-1,1); % higest pdf occurs when all the probability is in the smallest bin; the fraction (w) of the products stored immediately is limited to 10% (assume 90% or more are used first!) % lb_extended = [zeros(total_time_steps_interp-1,1); 0]; % pdf entries cannot be negative; the fraction (w) of products stored immediately cannot be negative ub_extended = [(1 / min(bin_widths_interp)) * ones(total_time_steps_interp-1,1); 0.1]; % higest pdf occurs when all the probability is in the smallest bin; the fraction (w) of the products stored immediately is limited to 10% (assume 90% or more are used first!) % normalising constraint: sum(bin_width .* pdf) = 1, which is % equivalent to Aeq * x = b_eq in the matrix algebra of the solver % NOTE: we need to convert bin widths column into row for the matrix algebra to work!! Aeq = transpose(bin_widths_interp); % bin widths; % Aeq_extended = [transpose(bin_widths_interp) 0]; % bin widths; additional 0 in the end excludes the weight (w) from the normalising condition beq = 1; % integral of the pdf over the bins which should be equal to 1 % % matrix multiplication checks: integrals of pdf_tu and pdf_tuu % whcih should be equal to 1 and are evaluated using Aeq % % Aeq*pdf_tu % NOTE: use "*" for matrix multiplication % Aeq*pdf_tuu % options = optimset('PlotFcns',@optimplotfval); % x_opt = fminsearch(fun_err,x0,options); fprintf('\n') text = ['running fmincon for UNU=' num2str(current_unu_key) ' "' char(current_product_description) '"...\n']; fprintf(text) fprintf('case 1: exclude items stored without being used first...\n') [x_opt, fmin] = fmincon(fun,x0,[],[],Aeq,beq,lb,ub,[],options); % NOTE the excluded fields (must be replaced with [] for the solver to work) % fprintf('case 2: include items stored without being used first...\n') [x_opt_extended, fmin_extended] = fmincon(fun_extended,x0_extended,[],[],Aeq_extended,beq,lb_extended,ub_extended,[],options); % NOTE the excluded fields (must be replaced with [] for the solver to work) fprintf('done!\n') % fprintf('min misfit no early storage = %f, min misfit with early storage = %f\n', fmin, fmin_extended) % optimal solution for x is the required pdf_ts if fmin <= fmin_extended % the case without early storage gives a better fit pdf_ts_interp = x_opt; frac_early_storage = 0; else % the case with early storage gives a better fit pdf_ts_interp = x_opt_extended(1:end-1); % all but last parameter in x_opt frac_early_storage = x_opt_extended(end); % last parameter in x_opt end % reconstruct the relevant cdf cdf_ts_interp = zeros(total_time_steps_interp,1); cdf_ts_interp(1) = 0; % initial value (just in case) for i=2:total_time_steps_interp % NOTE the starting value delta = time_steps_interp(i) - time_steps_interp(i-1); cdf_ts_interp(i) = cdf_ts_interp(i-1) + pdf_ts_interp(i-1) * delta; end %%% reconstructing optimal solution for the convolved pdf and cdf [~, cdf_conv_interp] = pdf_convolution(time_steps_interp,pdf_tu_interp,pdf_ts_interp); % make sure the dimensions are in agreement if size(cdf_ts_interp,1) == size(cdf_conv_interp,2) && size(cdf_ts_interp,2) == size(cdf_conv_interp,1) cdf_conv_interp = transpose(cdf_conv_interp); end cdf_tuurec_interp = frac_early_storage * cdf_ts_interp + (1-frac_early_storage) * cdf_conv_interp; %%% now create large random samples corresponding to the three pdfs and %%% fit analytical distributions using the fitdist function %%% this requires generating large pools of numbers distributed according to the pdfs updated_freq_tu = floor(scaling_factor_samples * pdf_tu_interp); % rounding down is necessary to generate integers; the error is small if the scaling factor is large updated_freq_ts = floor(scaling_factor_samples * pdf_ts_interp); % rounding down is necessary to generate integers; the error is small if the scaling factor is large updated_freq_tuu = floor(scaling_factor_samples * pdf_tuu_interp); % rounding down is necessary to generate integers; the error is small if the scaling factor is large for i=1:total_time_steps_interp-1 % NOTE the final index total_samples_tuuis_bin_tu = updated_freq_tu(i); total_samples_tuuis_bin_ts = updated_freq_ts(i); total_samples_tuuis_bin_tuu = updated_freq_tuu(i); uniform_rand_vector_tu = rand(total_samples_tuuis_bin_tu,1); uniform_rand_vector_ts = rand(total_samples_tuuis_bin_ts,1); uniform_rand_vector_tuu = rand(total_samples_tuuis_bin_tuu,1); % scale the uniform distribution with samples number proportinal to the pdf value over this bin to the width of the current bin delta = (time_steps_interp(i+1) - time_steps_interp(i)); samples_tuuis_bin_tu = time_steps_interp(i) + delta * uniform_rand_vector_tu; samples_tuuis_bin_ts = time_steps_interp(i) + delta * uniform_rand_vector_ts; samples_tuuis_bin_tuu = time_steps_interp(i) + delta * uniform_rand_vector_tuu; if i==1 all_samples_tu = samples_tuuis_bin_tu; all_samples_ts = samples_tuuis_bin_ts; all_samples_tuu = samples_tuuis_bin_tuu; else all_samples_tu = [all_samples_tu; samples_tuuis_bin_tu]; %#ok<*AGROW> all_samples_ts = [all_samples_ts; samples_tuuis_bin_ts]; all_samples_tuu = [all_samples_tuu; samples_tuuis_bin_tuu]; end end % if k==3 % % disp('debugging...') % % end %%% fitting Weibull distribution for tu dist = 'weibull'; pd = fitdist(all_samples_tu, dist); a_tu = pd.a; b_tu = pd.b; %%% fitting weibull distribution for tu+ts pd = fitdist(all_samples_tuu, dist); a_tuu = pd.a; b_tuu = pd.b; %%% recording the results into arrays through_count_products = through_count_products + 1; results_early_storage_cell(through_count_products,1) = num2cell(current_unu_key); %#ok<*SAGROW> results_early_storage_cell(through_count_products,2) = current_product_description; results_early_storage_cell(through_count_products,3) = num2cell(frac_early_storage); % aux_cell = cell(total_time_steps_interp,1); aux_cell(:,1) = num2cell(current_unu_key * ones(total_time_steps_interp,1)); aux_cell(:,2) = current_product_description; % aux_cell(:,3) = num2cell(time_steps_interp); % aux_cell(:,4) = num2cell(cdf_tu_interp); % "used" = "at home" - "unused" data (both measured from purchase) aux_cell(:,5) = num2cell(cdf_tuu_interp); % "unused" data (measured from purchase) % aux_cell(:,6) = num2cell(cdf_tuurec_interp); % "unused" reconstructed (measured from purchase) aux_cell(:,7) = num2cell(cdf_conv_interp); % this is tu+ts, reconstructed (measured from purchase) % aux_cell(:,8) = num2cell(cdf_ts_interp); % finally, this is ts, reconstructed (measured from end of use) % if through_count_products == 1 results_cdf_cell = aux_cell; else results_cdf_cell = [results_cdf_cell; aux_cell]; end %%% plotting % weib_pdf_tuu = wblpdf(time_steps_continuous_plotting,a_tuu,b_tuu); % weib_pdf_tu = wblpdf(time_steps_continuous_plotting,a_tu,b_tu); % weib_pdf_ts = wblpdf(time_steps_continuous_plotting,a_ts,b_ts); % % weib_cdf_tuu = wblcdf(time_steps_continuous_plotting,a_tuu,b_tuu); % weib_cdf_tu = wblcdf(time_steps_continuous_plotting,a_tu,b_tu); % weib_cdf_ts = wblcdf(time_steps_continuous_plotting,a_ts,b_ts); %%% pdf plots fig_pdf = figure; set(fig_pdf, 'Visible', 'off'); hold on % NOTE: scale the number of bins according to the actual data range % to ensure the bin width is the same for all the datasest default_range = time_steps_interp(end) - time_steps_interp(1); nbins = floor((total_time_steps_interp-1)*(max(all_samples_tuu) - min(all_samples_tuu))/default_range); % if nbins == 0 nbins = 1; end % l_tuu = histogram(all_samples_tuu,nbins,'FaceColor',[1 0 0]); % red nbins = floor((total_time_steps_interp-1)*(max(all_samples_tu) - min(all_samples_tu))/default_range); % if nbins == 0 nbins = 1; end % l_tu = histogram(all_samples_tu,nbins,'FaceColor',[0 0 1]); % blue nbins = floor((total_time_steps_interp-1)*(max(all_samples_ts) - min(all_samples_ts))/default_range); % if nbins == 0 nbins = 1; end % l_ts = histogram(all_samples_ts,nbins,'FaceColor',[0.5 0.5 0.5]); % grey % l_tuu = plot(time_steps_continuous_plotting,weib_pdf_tuu,'k','Linewidth',1.5); % l_tu = plot(time_steps_continuous_plotting,weib_pdf_tu,'b'); % l_ts = plot(time_steps_continuous_plotting,weib_pdf_ts,'m'); hold off xlabel('time (yrs)') ylabel('frequency (~pdf)') legend([l_tuu l_tu l_ts],{'age unused (from purchase)','age used (from purchase)','time stored (from end of use)'},'Location','northeast') text_title = ['PDFs, UNU=' num2str(current_unu_key) ' "' char(current_product_description) '"']; title(text_title) % text_figure = [folder '/PDFs_UNU_' num2str(current_unu_key) '_product_' num2str(p)]; saveas(fig_pdf, [text_figure '.png'], 'png') %%% cumulative plots to check fig_cdf = figure('Position', get(0, 'Screensize')); set(gca,'FontSize',14) set(gcf,'color','w') set(fig_cdf, 'Visible', 'off'); xmin = time_steps(1); xmax = time_steps(end); ymin = 0; ymax = 1; subplot(1,2,1) hold on l1 = plot(time_steps_interp,cdf_tuu_interp,'r'); l2 = plot(time_steps_interp,cdf_tu_interp,'b'); l3 = plot(time_steps_interp,cdf_ts_interp,'k'); % l22 = plot(time_steps_continuous_plotting,weib_cdf_tu,'c'); % l11 = plot(time_steps_continuous_plotting,weib_cdf_tuu,'m'); hold off set(gca,'xlim',[xmin xmax]) set(gca,'ylim',[ymin ymax]) xlabel('time (yrs)') ylabel('cdf') % legend([l1 l11 l2 l21],{'TU','TU weibull','TU+TS','TU+TS weibull'},'Location','northwest') legend([l1 l2 l3],{'age unused (from purchase)','age used (from purchase)','time in storage (from end of use)'},'Location','southeast') text_subplot = ['CDFs, UNU=' num2str(current_unu_key) ' "' char(current_product_description) '"']; title(text_subplot) % subplot(1,2,2) hold on l1 = plot(time_steps_interp,cdf_tuu_interp,'r'); l2 = plot(time_steps_interp,cdf_tuurec_interp,'c'); % this is reconstructed t_home % l3 = plot(time_steps_interp,cdf_conv_interp,'k'); % this is reconstructed tu+ts hold off set(gca,'xlim',[xmin xmax]) set(gca,'ylim',[ymin ymax]) xlabel('time (yrs)') ylabel('cdf') legend([l1 l2],{'age unused data','age unused reconstr'},'Location','northwest') text_subplot = ['CDFs: data vs reconstr, UNU=' num2str(current_unu_key) ' "' char(current_product_description) '"']; title(text_subplot) % text_figure = [folder '/CDFs_UNU_' num2str(current_unu_key) '_product_' num2str(p)]; saveas(fig_cdf, [text_figure '.png'], 'png') %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% closing the loop over products within a given unu key end %%% closing the loop over unu keys end %%% writing the results in files labels_early_storage = {'UNU', 'Product', 'frac_early_storage'}; matrix_early_storage = [labels_early_storage; results_early_storage_cell]; sheet = 'early_storage'; xlswrite(filename_output,matrix_early_storage,sheet) % labels_cdf = {'UNU', 'Product', 'time points', 'cdf_tu (age used, data)', 'cdf_tuu (age unused, data)',... 'cdf_tuurec (age unused, reconstr)', 'cdf_conv (tu+ts, reconstr)', 'cdf_ts (time stored, reconstr)'}; matrix_cdf = [labels_cdf; results_cdf_cell]; sheet = 'cdfs'; xlswrite(filename_output,matrix_cdf,sheet)